{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import argparse\n",
    "import os\n",
    "import time\n",
    "import math\n",
    "import torch\n",
    "import torch.nn as nn\n",
    "from torch.autograd import Variable\n",
    "from torchtext import data as d\n",
    "from torchtext import datasets\n",
    "from torchtext.vocab import GloVe\n",
    "import model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "is_cuda = torch.cuda.is_available()\n",
    "is_cuda"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "TEXT = d.Field(lower=True, batch_first=True,)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# make splits for data\n",
    "train, valid, test = datasets.WikiText2.splits(TEXT,root='data')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "batch_size=20\n",
    "bptt_len=30\n",
    "clip = 0.25\n",
    "lr = 20\n",
    "log_interval = 200"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "217640"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "(len(valid[0].text)//batch_size)*batch_size"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "217646"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(valid[0].text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "train[0].text = train[0].text[:(len(train[0].text)//batch_size)*batch_size]\n",
    "valid[0].text = valid[0].text[:(len(valid[0].text)//batch_size)*batch_size]\n",
    "test[0].text = test[0].text[:(len(valid[0].text)//batch_size)*batch_size]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "217640"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(valid[0].text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "train.fields {'text': <torchtext.data.field.Field object at 0x7f81b0f1bf60>}\n",
      "len(train) 1\n",
      "vars(train[0]) ['<eos>', '=', 'valkyria', 'chronicles', 'iii', '=', '<eos>', '<eos>', 'senjō', 'no']\n"
     ]
    }
   ],
   "source": [
    "# print information about the data\n",
    "print('train.fields', train.fields)\n",
    "print('len(train)', len(train))\n",
    "print('vars(train[0])', vars(train[0])['text'][0:10])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "TEXT.build_vocab(train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "len(TEXT.vocab) 28913\n"
     ]
    }
   ],
   "source": [
    "print('len(TEXT.vocab)', len(TEXT.vocab))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "train_iter, valid_iter, test_iter = d.BPTTIterator.splits((train, valid, test), batch_size=batch_size, bptt_len=bptt_len, device=0,repeat=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "class RNNModel(nn.Module):\n",
    "    def __init__(self,ntoken,ninp,nhid,nlayers,dropout=0.5,tie_weights=False):\n",
    "        super().__init__()\n",
    "        self.drop = nn.Dropout()\n",
    "        self.encoder = nn.Embedding(ntoken,ninp)\n",
    "        self.rnn = nn.LSTM(ninp,nhid,nlayers,dropout=dropout)\n",
    "        self.decoder = nn.Linear(nhid,ntoken)\n",
    "        if tie_weights:\n",
    "            self.decoder.weight = self.encoder.weight\n",
    "        \n",
    "        self.init_weights()\n",
    "        self.nhid = nhid\n",
    "        self.nlayers = nlayers\n",
    "        \n",
    "    def init_weights(self):\n",
    "        initrange = 0.1\n",
    "        self.encoder.weight.data.uniform_(-initrange,initrange)\n",
    "        self.decoder.bias.data.fill_(0)\n",
    "        self.decoder.weight.data.uniform_(-initrange,initrange)\n",
    "        \n",
    "    def forward(self,input,hidden): \n",
    "        \n",
    "        emb = self.drop(self.encoder(input))\n",
    "        output,hidden = self.rnn(emb,hidden)\n",
    "        output = self.drop(output)\n",
    "        s = output.size()\n",
    "        decoded = self.decoder(output.view(s[0]*s[1],s[2]))\n",
    "        return decoded.view(s[0],s[1],decoded.size(1)),hidden\n",
    "    \n",
    "    def init_hidden(self,bsz):\n",
    "        weight = next(self.parameters()).data\n",
    "        return(Variable(weight.new(self.nlayers,bsz,self.nhid).zero_()),Variable(weight.new(self.nlayers,bsz,self.nhid).zero_()))\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "criterion = nn.CrossEntropyLoss()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "217640"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(valid_iter.dataset[0].text)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "emsize = 200\n",
    "nhid=200\n",
    "nlayers=2\n",
    "dropout = 0.2\n",
    "\n",
    "ntokens = len(TEXT.vocab)\n",
    "lstm = RNNModel(ntokens, emsize, nhid,nlayers, dropout, 'store_true')\n",
    "if is_cuda:\n",
    "    lstm = lstm.cuda()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def repackage_hidden(h):\n",
    "    \"\"\"Wraps hidden states in new Variables, to detach them from their history.\"\"\"\n",
    "    if type(h) == Variable:\n",
    "        return Variable(h.data)\n",
    "    else:\n",
    "        return tuple(repackage_hidden(v) for v in h)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "\n",
    "def evaluate(data_source):\n",
    "    # Turn on evaluation mode which disables dropout.\n",
    "    lstm.eval()\n",
    "    total_loss = 0   \n",
    "    hidden = lstm.init_hidden(batch_size)\n",
    "    for batch in data_source:        \n",
    "        data, targets = batch.text,batch.target.view(-1)\n",
    "        output, hidden = lstm(data, hidden)\n",
    "        output_flat = output.view(-1, ntokens)\n",
    "        total_loss += len(data) * criterion(output_flat, targets).data\n",
    "        hidden = repackage_hidden(hidden)\n",
    "    return total_loss[0]/(len(data_source.dataset[0].text)//batch_size) \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def trainf():\n",
    "    # Turn on training mode which enables dropout.\n",
    "    lstm.train()\n",
    "    total_loss = 0\n",
    "    start_time = time.time()\n",
    "    hidden = lstm.init_hidden(batch_size)\n",
    "    for  i,batch in enumerate(train_iter):\n",
    "        data, targets = batch.text,batch.target.view(-1)\n",
    "        # Starting each batch, we detach the hidden state from how it was previously produced.\n",
    "        # If we didn't, the model would try backpropagating all the way to start of the dataset.\n",
    "        hidden = repackage_hidden(hidden)\n",
    "        lstm.zero_grad()\n",
    "        output, hidden = lstm(data, hidden)\n",
    "        loss = criterion(output.view(-1, ntokens), targets)\n",
    "        loss.backward()\n",
    "\n",
    "        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.\n",
    "        torch.nn.utils.clip_grad_norm(lstm.parameters(), clip)\n",
    "        for p in lstm.parameters():\n",
    "            p.data.add_(-lr, p.grad.data)\n",
    "\n",
    "        total_loss += loss.data\n",
    "\n",
    "        if i % log_interval == 0 and i > 0:\n",
    "            cur_loss = total_loss[0] / log_interval\n",
    "            elapsed = time.time() - start_time\n",
    "            (print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | loss {:5.2f} | ppl {:8.2f}'.format(epoch, i, len(train_iter), lr,elapsed * 1000 / log_interval, cur_loss, math.exp(cur_loss))))\n",
    "            total_loss = 0\n",
    "            start_time = time.time()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "| epoch   1 |   200/ 3481 batches | lr 20.00 | ms/batch 12.15 | loss  7.45 | ppl  1719.76\n",
      "| epoch   1 |   400/ 3481 batches | lr 20.00 | ms/batch  9.28 | loss  6.71 | ppl   816.60\n",
      "| epoch   1 |   600/ 3481 batches | lr 20.00 | ms/batch  9.14 | loss  6.36 | ppl   578.74\n",
      "| epoch   1 |   800/ 3481 batches | lr 20.00 | ms/batch  9.12 | loss  6.19 | ppl   487.93\n",
      "| epoch   1 |  1000/ 3481 batches | lr 20.00 | ms/batch  9.11 | loss  6.11 | ppl   451.40\n",
      "| epoch   1 |  1200/ 3481 batches | lr 20.00 | ms/batch  9.13 | loss  6.03 | ppl   416.71\n",
      "| epoch   1 |  1400/ 3481 batches | lr 20.00 | ms/batch  9.13 | loss  5.99 | ppl   397.83\n",
      "| epoch   1 |  1600/ 3481 batches | lr 20.00 | ms/batch  9.11 | loss  5.92 | ppl   371.53\n",
      "| epoch   1 |  1800/ 3481 batches | lr 20.00 | ms/batch  9.11 | loss  5.92 | ppl   372.40\n",
      "| epoch   1 |  2000/ 3481 batches | lr 20.00 | ms/batch  9.21 | loss  5.84 | ppl   344.93\n",
      "| epoch   1 |  2200/ 3481 batches | lr 20.00 | ms/batch  9.38 | loss  5.81 | ppl   334.10\n",
      "| epoch   1 |  2400/ 3481 batches | lr 20.00 | ms/batch  9.25 | loss  5.77 | ppl   319.38\n",
      "| epoch   1 |  2600/ 3481 batches | lr 20.00 | ms/batch  9.26 | loss  5.66 | ppl   288.57\n",
      "| epoch   1 |  2800/ 3481 batches | lr 20.00 | ms/batch  9.25 | loss  5.73 | ppl   308.31\n",
      "| epoch   1 |  3000/ 3481 batches | lr 20.00 | ms/batch  9.26 | loss  5.72 | ppl   303.61\n",
      "| epoch   1 |  3200/ 3481 batches | lr 20.00 | ms/batch  9.27 | loss  5.63 | ppl   279.86\n",
      "| epoch   1 |  3400/ 3481 batches | lr 20.00 | ms/batch  9.26 | loss  5.60 | ppl   270.28\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/vishnu/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:7: DeprecationWarning: generator 'BPTTIterator.__iter__' raised StopIteration\n",
      "  import sys\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "val loss 5.429313100188384\n",
      "-----------------------------------------------------------------------------------------\n",
      "| end of epoch   1 | time: 33.77s | valid loss  5.43 | valid ppl   227.99\n",
      "-----------------------------------------------------------------------------------------\n",
      "| epoch   2 |   200/ 3481 batches | lr 20.00 | ms/batch 11.32 | loss  5.63 | ppl   279.22\n",
      "| epoch   2 |   400/ 3481 batches | lr 20.00 | ms/batch  9.28 | loss  5.62 | ppl   276.09\n",
      "| epoch   2 |   600/ 3481 batches | lr 20.00 | ms/batch  9.28 | loss  5.56 | ppl   260.38\n",
      "| epoch   2 |   800/ 3481 batches | lr 20.00 | ms/batch  9.38 | loss  5.48 | ppl   239.33\n",
      "| epoch   2 |  1000/ 3481 batches | lr 20.00 | ms/batch  9.39 | loss  5.52 | ppl   250.83\n",
      "| epoch   2 |  1200/ 3481 batches | lr 20.00 | ms/batch  9.30 | loss  5.48 | ppl   239.87\n",
      "| epoch   2 |  1400/ 3481 batches | lr 20.00 | ms/batch  9.29 | loss  5.50 | ppl   243.73\n",
      "| epoch   2 |  1600/ 3481 batches | lr 20.00 | ms/batch  9.30 | loss  5.50 | ppl   244.61\n",
      "| epoch   2 |  1800/ 3481 batches | lr 20.00 | ms/batch  9.30 | loss  5.53 | ppl   251.20\n",
      "| epoch   2 |  2000/ 3481 batches | lr 20.00 | ms/batch  9.46 | loss  5.47 | ppl   238.41\n",
      "| epoch   2 |  2200/ 3481 batches | lr 20.00 | ms/batch  9.29 | loss  5.47 | ppl   237.79\n",
      "| epoch   2 |  2400/ 3481 batches | lr 20.00 | ms/batch  9.31 | loss  5.44 | ppl   231.08\n",
      "| epoch   2 |  2600/ 3481 batches | lr 20.00 | ms/batch  9.32 | loss  5.36 | ppl   212.75\n",
      "| epoch   2 |  2800/ 3481 batches | lr 20.00 | ms/batch  9.31 | loss  5.44 | ppl   229.90\n",
      "| epoch   2 |  3000/ 3481 batches | lr 20.00 | ms/batch  9.35 | loss  5.43 | ppl   228.37\n",
      "| epoch   2 |  3200/ 3481 batches | lr 20.00 | ms/batch  9.35 | loss  5.38 | ppl   217.47\n",
      "| epoch   2 |  3400/ 3481 batches | lr 20.00 | ms/batch  9.35 | loss  5.34 | ppl   207.80\n",
      "val loss 5.213486146848006\n",
      "-----------------------------------------------------------------------------------------\n",
      "| end of epoch   2 | time: 34.01s | valid loss  5.21 | valid ppl   183.73\n",
      "-----------------------------------------------------------------------------------------\n",
      "| epoch   3 |   200/ 3481 batches | lr 20.00 | ms/batch 11.36 | loss  5.41 | ppl   223.78\n",
      "| epoch   3 |   400/ 3481 batches | lr 20.00 | ms/batch  9.35 | loss  5.42 | ppl   225.73\n",
      "| epoch   3 |   600/ 3481 batches | lr 20.00 | ms/batch  9.36 | loss  5.36 | ppl   212.47\n",
      "| epoch   3 |   800/ 3481 batches | lr 20.00 | ms/batch  9.35 | loss  5.29 | ppl   197.51\n",
      "| epoch   3 |  1000/ 3481 batches | lr 20.00 | ms/batch  9.35 | loss  5.36 | ppl   212.98\n",
      "| epoch   3 |  1200/ 3481 batches | lr 20.00 | ms/batch  9.46 | loss  5.31 | ppl   202.32\n",
      "| epoch   3 |  1400/ 3481 batches | lr 20.00 | ms/batch  9.36 | loss  5.33 | ppl   206.49\n",
      "| epoch   3 |  1600/ 3481 batches | lr 20.00 | ms/batch  9.36 | loss  5.36 | ppl   212.32\n",
      "| epoch   3 |  1800/ 3481 batches | lr 20.00 | ms/batch  9.36 | loss  5.38 | ppl   217.51\n",
      "| epoch   3 |  2000/ 3481 batches | lr 20.00 | ms/batch  9.36 | loss  5.34 | ppl   207.53\n",
      "| epoch   3 |  2200/ 3481 batches | lr 20.00 | ms/batch  9.35 | loss  5.34 | ppl   209.19\n",
      "| epoch   3 |  2400/ 3481 batches | lr 20.00 | ms/batch  9.35 | loss  5.32 | ppl   204.89\n",
      "| epoch   3 |  2600/ 3481 batches | lr 20.00 | ms/batch  9.34 | loss  5.23 | ppl   187.34\n",
      "| epoch   3 |  2800/ 3481 batches | lr 20.00 | ms/batch  9.34 | loss  5.32 | ppl   204.09\n",
      "| epoch   3 |  3000/ 3481 batches | lr 20.00 | ms/batch  9.35 | loss  5.32 | ppl   203.37\n",
      "| epoch   3 |  3200/ 3481 batches | lr 20.00 | ms/batch  9.35 | loss  5.28 | ppl   196.71\n",
      "| epoch   3 |  3400/ 3481 batches | lr 20.00 | ms/batch  9.35 | loss  5.23 | ppl   186.61\n",
      "val loss 5.150966403809043\n",
      "-----------------------------------------------------------------------------------------\n",
      "| end of epoch   3 | time: 34.11s | valid loss  5.15 | valid ppl   172.60\n",
      "-----------------------------------------------------------------------------------------\n",
      "| epoch   4 |   200/ 3481 batches | lr 20.00 | ms/batch 11.36 | loss  5.30 | ppl   201.14\n",
      "| epoch   4 |   400/ 3481 batches | lr 20.00 | ms/batch  9.33 | loss  5.32 | ppl   203.68\n",
      "| epoch   4 |   600/ 3481 batches | lr 20.00 | ms/batch  9.33 | loss  5.26 | ppl   193.39\n",
      "| epoch   4 |   800/ 3481 batches | lr 20.00 | ms/batch  9.34 | loss  5.18 | ppl   177.42\n",
      "| epoch   4 |  1000/ 3481 batches | lr 20.00 | ms/batch  9.33 | loss  5.27 | ppl   193.79\n",
      "| epoch   4 |  1200/ 3481 batches | lr 20.00 | ms/batch  9.44 | loss  5.22 | ppl   185.61\n",
      "| epoch   4 |  1400/ 3481 batches | lr 20.00 | ms/batch  9.34 | loss  5.25 | ppl   190.44\n",
      "| epoch   4 |  1600/ 3481 batches | lr 20.00 | ms/batch  9.33 | loss  5.28 | ppl   195.65\n",
      "| epoch   4 |  1800/ 3481 batches | lr 20.00 | ms/batch  9.33 | loss  5.31 | ppl   201.49\n",
      "| epoch   4 |  2000/ 3481 batches | lr 20.00 | ms/batch  9.33 | loss  5.26 | ppl   191.95\n",
      "| epoch   4 |  2200/ 3481 batches | lr 20.00 | ms/batch  9.33 | loss  5.27 | ppl   193.98\n",
      "| epoch   4 |  2400/ 3481 batches | lr 20.00 | ms/batch  9.33 | loss  5.25 | ppl   190.47\n",
      "| epoch   4 |  2600/ 3481 batches | lr 20.00 | ms/batch  9.35 | loss  5.16 | ppl   173.38\n",
      "| epoch   4 |  2800/ 3481 batches | lr 20.00 | ms/batch  9.33 | loss  5.24 | ppl   188.79\n",
      "| epoch   4 |  3000/ 3481 batches | lr 20.00 | ms/batch  9.34 | loss  5.23 | ppl   187.21\n",
      "| epoch   4 |  3200/ 3481 batches | lr 20.00 | ms/batch  9.34 | loss  5.21 | ppl   183.22\n",
      "| epoch   4 |  3400/ 3481 batches | lr 20.00 | ms/batch  9.34 | loss  5.16 | ppl   174.26\n",
      "val loss 5.084544718342216\n",
      "-----------------------------------------------------------------------------------------\n",
      "| end of epoch   4 | time: 34.06s | valid loss  5.08 | valid ppl   161.51\n",
      "-----------------------------------------------------------------------------------------\n",
      "| epoch   5 |   200/ 3481 batches | lr 20.00 | ms/batch 11.49 | loss  5.25 | ppl   189.73\n",
      "| epoch   5 |   400/ 3481 batches | lr 20.00 | ms/batch  9.36 | loss  5.26 | ppl   191.74\n",
      "| epoch   5 |   600/ 3481 batches | lr 20.00 | ms/batch  9.35 | loss  5.20 | ppl   180.47\n",
      "| epoch   5 |   800/ 3481 batches | lr 20.00 | ms/batch  9.35 | loss  5.12 | ppl   166.81\n",
      "| epoch   5 |  1000/ 3481 batches | lr 20.00 | ms/batch  9.35 | loss  5.21 | ppl   183.33\n",
      "| epoch   5 |  1200/ 3481 batches | lr 20.00 | ms/batch  9.35 | loss  5.17 | ppl   175.66\n",
      "| epoch   5 |  1400/ 3481 batches | lr 20.00 | ms/batch  9.53 | loss  5.20 | ppl   180.53\n",
      "| epoch   5 |  1600/ 3481 batches | lr 20.00 | ms/batch  9.27 | loss  5.22 | ppl   185.18\n",
      "| epoch   5 |  1800/ 3481 batches | lr 20.00 | ms/batch  9.27 | loss  5.25 | ppl   190.20\n",
      "| epoch   5 |  2000/ 3481 batches | lr 20.00 | ms/batch  9.28 | loss  5.20 | ppl   182.05\n",
      "| epoch   5 |  2200/ 3481 batches | lr 20.00 | ms/batch  9.27 | loss  5.22 | ppl   185.67\n",
      "| epoch   5 |  2400/ 3481 batches | lr 20.00 | ms/batch  9.27 | loss  5.21 | ppl   182.19\n",
      "| epoch   5 |  2600/ 3481 batches | lr 20.00 | ms/batch  9.28 | loss  5.11 | ppl   165.15\n",
      "| epoch   5 |  2800/ 3481 batches | lr 20.00 | ms/batch  9.28 | loss  5.19 | ppl   179.06\n",
      "| epoch   5 |  3000/ 3481 batches | lr 20.00 | ms/batch  9.29 | loss  5.19 | ppl   179.18\n",
      "| epoch   5 |  3200/ 3481 batches | lr 20.00 | ms/batch  9.29 | loss  5.16 | ppl   174.57\n",
      "| epoch   5 |  3400/ 3481 batches | lr 20.00 | ms/batch  9.33 | loss  5.11 | ppl   165.96\n",
      "val loss 5.02601342813821\n",
      "-----------------------------------------------------------------------------------------\n",
      "| end of epoch   5 | time: 34.01s | valid loss  5.03 | valid ppl   152.32\n",
      "-----------------------------------------------------------------------------------------\n",
      "| epoch   6 |   200/ 3481 batches | lr 20.00 | ms/batch 11.25 | loss  5.19 | ppl   179.97\n",
      "| epoch   6 |   400/ 3481 batches | lr 20.00 | ms/batch  9.45 | loss  5.21 | ppl   183.38\n",
      "| epoch   6 |   600/ 3481 batches | lr 20.00 | ms/batch  9.36 | loss  5.15 | ppl   172.76\n",
      "| epoch   6 |   800/ 3481 batches | lr 20.00 | ms/batch  9.33 | loss  5.07 | ppl   159.38\n",
      "| epoch   6 |  1000/ 3481 batches | lr 20.00 | ms/batch  9.37 | loss  5.16 | ppl   174.78\n",
      "| epoch   6 |  1200/ 3481 batches | lr 20.00 | ms/batch  9.38 | loss  5.12 | ppl   167.72\n",
      "| epoch   6 |  1400/ 3481 batches | lr 20.00 | ms/batch  9.37 | loss  5.15 | ppl   172.30\n",
      "| epoch   6 |  1600/ 3481 batches | lr 20.00 | ms/batch  9.35 | loss  5.18 | ppl   177.84\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "| epoch   6 |  1800/ 3481 batches | lr 20.00 | ms/batch  9.36 | loss  5.21 | ppl   183.73\n",
      "| epoch   6 |  2000/ 3481 batches | lr 20.00 | ms/batch  9.34 | loss  5.17 | ppl   175.34\n",
      "| epoch   6 |  2200/ 3481 batches | lr 20.00 | ms/batch  9.32 | loss  5.18 | ppl   178.14\n",
      "| epoch   6 |  2400/ 3481 batches | lr 20.00 | ms/batch  9.35 | loss  5.16 | ppl   173.92\n",
      "| epoch   6 |  2600/ 3481 batches | lr 20.00 | ms/batch  9.34 | loss  5.07 | ppl   158.48\n",
      "| epoch   6 |  2800/ 3481 batches | lr 20.00 | ms/batch  9.35 | loss  5.15 | ppl   171.85\n",
      "| epoch   6 |  3000/ 3481 batches | lr 20.00 | ms/batch  9.37 | loss  5.14 | ppl   171.37\n",
      "| epoch   6 |  3200/ 3481 batches | lr 20.00 | ms/batch  9.40 | loss  5.12 | ppl   167.27\n",
      "| epoch   6 |  3400/ 3481 batches | lr 20.00 | ms/batch  9.36 | loss  5.08 | ppl   160.14\n",
      "val loss 5.006427974522147\n",
      "-----------------------------------------------------------------------------------------\n",
      "| end of epoch   6 | time: 34.11s | valid loss  5.01 | valid ppl   149.37\n",
      "-----------------------------------------------------------------------------------------\n",
      "| epoch   7 |   200/ 3481 batches | lr 20.00 | ms/batch 11.44 | loss  5.16 | ppl   174.18\n",
      "| epoch   7 |   400/ 3481 batches | lr 20.00 | ms/batch  9.47 | loss  5.18 | ppl   177.50\n",
      "| epoch   7 |   600/ 3481 batches | lr 20.00 | ms/batch  9.42 | loss  5.11 | ppl   166.45\n",
      "| epoch   7 |   800/ 3481 batches | lr 20.00 | ms/batch  9.38 | loss  5.04 | ppl   153.92\n",
      "| epoch   7 |  1000/ 3481 batches | lr 20.00 | ms/batch  9.38 | loss  5.13 | ppl   169.83\n",
      "| epoch   7 |  1200/ 3481 batches | lr 20.00 | ms/batch  9.40 | loss  5.09 | ppl   161.95\n",
      "| epoch   7 |  1400/ 3481 batches | lr 20.00 | ms/batch  9.43 | loss  5.12 | ppl   166.93\n",
      "| epoch   7 |  1600/ 3481 batches | lr 20.00 | ms/batch  9.40 | loss  5.15 | ppl   173.24\n",
      "| epoch   7 |  1800/ 3481 batches | lr 20.00 | ms/batch  9.42 | loss  5.18 | ppl   177.26\n",
      "| epoch   7 |  2000/ 3481 batches | lr 20.00 | ms/batch  9.43 | loss  5.13 | ppl   169.83\n",
      "| epoch   7 |  2200/ 3481 batches | lr 20.00 | ms/batch  9.45 | loss  5.16 | ppl   173.34\n",
      "| epoch   7 |  2400/ 3481 batches | lr 20.00 | ms/batch  9.43 | loss  5.13 | ppl   169.23\n",
      "| epoch   7 |  2600/ 3481 batches | lr 20.00 | ms/batch  9.41 | loss  5.03 | ppl   152.92\n",
      "| epoch   7 |  2800/ 3481 batches | lr 20.00 | ms/batch  9.40 | loss  5.12 | ppl   166.63\n",
      "| epoch   7 |  3000/ 3481 batches | lr 20.00 | ms/batch  9.42 | loss  5.12 | ppl   166.99\n",
      "| epoch   7 |  3200/ 3481 batches | lr 20.00 | ms/batch  9.41 | loss  5.09 | ppl   163.10\n",
      "| epoch   7 |  3400/ 3481 batches | lr 20.00 | ms/batch  9.46 | loss  5.05 | ppl   156.00\n",
      "val loss 4.987114256915089\n",
      "-----------------------------------------------------------------------------------------\n",
      "| end of epoch   7 | time: 34.33s | valid loss  4.99 | valid ppl   146.51\n",
      "-----------------------------------------------------------------------------------------\n",
      "| epoch   8 |   200/ 3481 batches | lr 20.00 | ms/batch 12.24 | loss  5.13 | ppl   168.38\n",
      "| epoch   8 |   400/ 3481 batches | lr 20.00 | ms/batch  9.59 | loss  5.15 | ppl   172.37\n",
      "| epoch   8 |   600/ 3481 batches | lr 20.00 | ms/batch  9.44 | loss  5.09 | ppl   161.60\n",
      "| epoch   8 |   800/ 3481 batches | lr 20.00 | ms/batch  9.33 | loss  5.00 | ppl   148.85\n",
      "| epoch   8 |  1000/ 3481 batches | lr 20.00 | ms/batch  9.33 | loss  5.11 | ppl   165.77\n",
      "| epoch   8 |  1200/ 3481 batches | lr 20.00 | ms/batch  9.34 | loss  5.06 | ppl   158.26\n",
      "| epoch   8 |  1400/ 3481 batches | lr 20.00 | ms/batch  9.33 | loss  5.10 | ppl   163.21\n",
      "| epoch   8 |  1600/ 3481 batches | lr 20.00 | ms/batch  9.33 | loss  5.13 | ppl   169.28\n",
      "| epoch   8 |  1800/ 3481 batches | lr 20.00 | ms/batch  9.37 | loss  5.16 | ppl   173.92\n",
      "| epoch   8 |  2000/ 3481 batches | lr 20.00 | ms/batch  9.34 | loss  5.11 | ppl   165.50\n",
      "| epoch   8 |  2200/ 3481 batches | lr 20.00 | ms/batch  9.42 | loss  5.13 | ppl   169.52\n",
      "| epoch   8 |  2400/ 3481 batches | lr 20.00 | ms/batch  9.38 | loss  5.11 | ppl   165.74\n",
      "| epoch   8 |  2600/ 3481 batches | lr 20.00 | ms/batch  9.40 | loss  5.01 | ppl   149.90\n",
      "| epoch   8 |  2800/ 3481 batches | lr 20.00 | ms/batch  9.40 | loss  5.09 | ppl   163.10\n",
      "| epoch   8 |  3000/ 3481 batches | lr 20.00 | ms/batch  9.40 | loss  5.09 | ppl   162.83\n",
      "| epoch   8 |  3200/ 3481 batches | lr 20.00 | ms/batch  9.46 | loss  5.08 | ppl   160.06\n",
      "| epoch   8 |  3400/ 3481 batches | lr 20.00 | ms/batch  9.40 | loss  5.03 | ppl   152.20\n",
      "val loss 4.973778731506157\n",
      "-----------------------------------------------------------------------------------------\n",
      "| end of epoch   8 | time: 34.41s | valid loss  4.97 | valid ppl   144.57\n",
      "-----------------------------------------------------------------------------------------\n",
      "| epoch   9 |   200/ 3481 batches | lr 20.00 | ms/batch 11.38 | loss  5.11 | ppl   164.98\n",
      "| epoch   9 |   400/ 3481 batches | lr 20.00 | ms/batch  9.39 | loss  5.12 | ppl   168.15\n",
      "| epoch   9 |   600/ 3481 batches | lr 20.00 | ms/batch  9.59 | loss  5.06 | ppl   157.08\n",
      "| epoch   9 |   800/ 3481 batches | lr 20.00 | ms/batch  9.56 | loss  4.98 | ppl   145.42\n",
      "| epoch   9 |  1000/ 3481 batches | lr 20.00 | ms/batch  9.42 | loss  5.09 | ppl   161.71\n",
      "| epoch   9 |  1200/ 3481 batches | lr 20.00 | ms/batch  9.48 | loss  5.04 | ppl   153.78\n",
      "| epoch   9 |  1400/ 3481 batches | lr 20.00 | ms/batch  9.41 | loss  5.07 | ppl   159.70\n",
      "| epoch   9 |  1600/ 3481 batches | lr 20.00 | ms/batch  9.43 | loss  5.10 | ppl   164.38\n",
      "| epoch   9 |  1800/ 3481 batches | lr 20.00 | ms/batch  9.41 | loss  5.13 | ppl   169.33\n",
      "| epoch   9 |  2000/ 3481 batches | lr 20.00 | ms/batch  9.45 | loss  5.09 | ppl   162.73\n",
      "| epoch   9 |  2200/ 3481 batches | lr 20.00 | ms/batch  9.43 | loss  5.11 | ppl   165.30\n",
      "| epoch   9 |  2400/ 3481 batches | lr 20.00 | ms/batch  9.43 | loss  5.08 | ppl   161.28\n",
      "| epoch   9 |  2600/ 3481 batches | lr 20.00 | ms/batch  9.47 | loss  4.99 | ppl   146.52\n",
      "| epoch   9 |  2800/ 3481 batches | lr 20.00 | ms/batch  9.41 | loss  5.07 | ppl   159.78\n",
      "| epoch   9 |  3000/ 3481 batches | lr 20.00 | ms/batch  9.47 | loss  5.07 | ppl   159.58\n",
      "| epoch   9 |  3200/ 3481 batches | lr 20.00 | ms/batch  9.42 | loss  5.06 | ppl   156.91\n",
      "| epoch   9 |  3400/ 3481 batches | lr 20.00 | ms/batch  9.45 | loss  5.00 | ppl   148.82\n",
      "val loss 4.9533701726474915\n",
      "-----------------------------------------------------------------------------------------\n",
      "| end of epoch   9 | time: 34.42s | valid loss  4.95 | valid ppl   141.65\n",
      "-----------------------------------------------------------------------------------------\n",
      "| epoch  10 |   200/ 3481 batches | lr 20.00 | ms/batch 11.54 | loss  5.08 | ppl   161.50\n",
      "| epoch  10 |   400/ 3481 batches | lr 20.00 | ms/batch  9.44 | loss  5.10 | ppl   164.58\n",
      "| epoch  10 |   600/ 3481 batches | lr 20.00 | ms/batch  9.43 | loss  5.04 | ppl   155.04\n",
      "| epoch  10 |   800/ 3481 batches | lr 20.00 | ms/batch  9.46 | loss  4.96 | ppl   142.83\n",
      "| epoch  10 |  1000/ 3481 batches | lr 20.00 | ms/batch  9.56 | loss  5.07 | ppl   158.67\n",
      "| epoch  10 |  1200/ 3481 batches | lr 20.00 | ms/batch  9.45 | loss  5.03 | ppl   152.30\n",
      "| epoch  10 |  1400/ 3481 batches | lr 20.00 | ms/batch  9.44 | loss  5.06 | ppl   156.88\n",
      "| epoch  10 |  1600/ 3481 batches | lr 20.00 | ms/batch  9.47 | loss  5.09 | ppl   162.14\n",
      "| epoch  10 |  1800/ 3481 batches | lr 20.00 | ms/batch  9.65 | loss  5.11 | ppl   166.18\n",
      "| epoch  10 |  2000/ 3481 batches | lr 20.00 | ms/batch  9.43 | loss  5.08 | ppl   160.07\n",
      "| epoch  10 |  2200/ 3481 batches | lr 20.00 | ms/batch  9.46 | loss  5.09 | ppl   161.90\n",
      "| epoch  10 |  2400/ 3481 batches | lr 20.00 | ms/batch  9.44 | loss  5.06 | ppl   158.29\n",
      "| epoch  10 |  2600/ 3481 batches | lr 20.00 | ms/batch  9.43 | loss  4.97 | ppl   143.49\n",
      "| epoch  10 |  2800/ 3481 batches | lr 20.00 | ms/batch  9.46 | loss  5.06 | ppl   156.85\n",
      "| epoch  10 |  3000/ 3481 batches | lr 20.00 | ms/batch  9.45 | loss  5.05 | ppl   156.47\n",
      "| epoch  10 |  3200/ 3481 batches | lr 20.00 | ms/batch  9.44 | loss  5.04 | ppl   153.89\n",
      "| epoch  10 |  3400/ 3481 batches | lr 20.00 | ms/batch  9.44 | loss  4.99 | ppl   146.28\n",
      "val loss 4.948440874264841\n",
      "-----------------------------------------------------------------------------------------\n",
      "| end of epoch  10 | time: 34.50s | valid loss  4.95 | valid ppl   140.96\n",
      "-----------------------------------------------------------------------------------------\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "| epoch  11 |   200/ 3481 batches | lr 20.00 | ms/batch 11.40 | loss  5.07 | ppl   158.85\n",
      "| epoch  11 |   400/ 3481 batches | lr 20.00 | ms/batch  9.40 | loss  5.08 | ppl   160.87\n",
      "| epoch  11 |   600/ 3481 batches | lr 20.00 | ms/batch  9.52 | loss  5.03 | ppl   152.22\n",
      "| epoch  11 |   800/ 3481 batches | lr 20.00 | ms/batch  9.39 | loss  4.94 | ppl   139.70\n",
      "| epoch  11 |  1000/ 3481 batches | lr 20.00 | ms/batch  9.56 | loss  5.05 | ppl   155.53\n",
      "| epoch  11 |  1200/ 3481 batches | lr 20.00 | ms/batch  9.44 | loss  5.00 | ppl   148.91\n",
      "| epoch  11 |  1400/ 3481 batches | lr 20.00 | ms/batch  9.51 | loss  5.04 | ppl   154.96\n",
      "| epoch  11 |  1600/ 3481 batches | lr 20.00 | ms/batch  9.42 | loss  5.07 | ppl   159.64\n",
      "| epoch  11 |  1800/ 3481 batches | lr 20.00 | ms/batch  9.42 | loss  5.10 | ppl   163.45\n",
      "| epoch  11 |  2000/ 3481 batches | lr 20.00 | ms/batch  9.45 | loss  5.06 | ppl   157.49\n",
      "| epoch  11 |  2200/ 3481 batches | lr 20.00 | ms/batch  9.45 | loss  5.08 | ppl   160.64\n",
      "| epoch  11 |  2400/ 3481 batches | lr 20.00 | ms/batch  9.47 | loss  5.05 | ppl   156.48\n",
      "| epoch  11 |  2600/ 3481 batches | lr 20.00 | ms/batch  9.44 | loss  4.96 | ppl   142.26\n",
      "| epoch  11 |  2800/ 3481 batches | lr 20.00 | ms/batch  9.43 | loss  5.04 | ppl   153.95\n",
      "| epoch  11 |  3000/ 3481 batches | lr 20.00 | ms/batch  9.44 | loss  5.04 | ppl   154.73\n",
      "| epoch  11 |  3200/ 3481 batches | lr 20.00 | ms/batch  9.39 | loss  5.02 | ppl   151.59\n",
      "| epoch  11 |  3400/ 3481 batches | lr 20.00 | ms/batch  9.47 | loss  4.97 | ppl   144.30\n",
      "val loss 4.929915442358942\n",
      "-----------------------------------------------------------------------------------------\n",
      "| end of epoch  11 | time: 34.43s | valid loss  4.93 | valid ppl   138.37\n",
      "-----------------------------------------------------------------------------------------\n",
      "| epoch  12 |   200/ 3481 batches | lr 20.00 | ms/batch 11.88 | loss  5.05 | ppl   155.30\n",
      "| epoch  12 |   400/ 3481 batches | lr 20.00 | ms/batch  9.67 | loss  5.07 | ppl   159.23\n",
      "| epoch  12 |   600/ 3481 batches | lr 20.00 | ms/batch  9.35 | loss  5.01 | ppl   149.85\n",
      "| epoch  12 |   800/ 3481 batches | lr 20.00 | ms/batch  9.36 | loss  4.92 | ppl   137.12\n",
      "| epoch  12 |  1000/ 3481 batches | lr 20.00 | ms/batch  9.40 | loss  5.03 | ppl   153.49\n",
      "| epoch  12 |  1200/ 3481 batches | lr 20.00 | ms/batch  9.50 | loss  4.99 | ppl   146.27\n",
      "| epoch  12 |  1400/ 3481 batches | lr 20.00 | ms/batch  9.38 | loss  5.02 | ppl   151.12\n",
      "| epoch  12 |  1600/ 3481 batches | lr 20.00 | ms/batch  9.43 | loss  5.06 | ppl   156.87\n",
      "| epoch  12 |  1800/ 3481 batches | lr 20.00 | ms/batch  9.41 | loss  5.08 | ppl   161.10\n",
      "| epoch  12 |  2000/ 3481 batches | lr 20.00 | ms/batch  9.45 | loss  5.05 | ppl   155.29\n",
      "| epoch  12 |  2200/ 3481 batches | lr 20.00 | ms/batch  9.44 | loss  5.07 | ppl   158.49\n",
      "| epoch  12 |  2400/ 3481 batches | lr 20.00 | ms/batch  9.47 | loss  5.04 | ppl   154.07\n",
      "| epoch  12 |  2600/ 3481 batches | lr 20.00 | ms/batch  9.47 | loss  4.93 | ppl   138.90\n",
      "| epoch  12 |  2800/ 3481 batches | lr 20.00 | ms/batch  9.45 | loss  5.02 | ppl   151.67\n",
      "| epoch  12 |  3000/ 3481 batches | lr 20.00 | ms/batch  9.48 | loss  5.03 | ppl   152.17\n",
      "| epoch  12 |  3200/ 3481 batches | lr 20.00 | ms/batch  9.48 | loss  5.01 | ppl   149.84\n",
      "| epoch  12 |  3400/ 3481 batches | lr 20.00 | ms/batch  9.41 | loss  4.96 | ppl   142.49\n",
      "val loss 4.925156652040066\n",
      "-----------------------------------------------------------------------------------------\n",
      "| end of epoch  12 | time: 34.51s | valid loss  4.93 | valid ppl   137.71\n",
      "-----------------------------------------------------------------------------------------\n",
      "| epoch  13 |   200/ 3481 batches | lr 20.00 | ms/batch 11.44 | loss  5.04 | ppl   154.34\n",
      "| epoch  13 |   400/ 3481 batches | lr 20.00 | ms/batch  9.45 | loss  5.05 | ppl   156.52\n",
      "| epoch  13 |   600/ 3481 batches | lr 20.00 | ms/batch  9.37 | loss  4.99 | ppl   147.67\n",
      "| epoch  13 |   800/ 3481 batches | lr 20.00 | ms/batch  9.39 | loss  4.91 | ppl   136.18\n",
      "| epoch  13 |  1000/ 3481 batches | lr 20.00 | ms/batch  9.41 | loss  5.02 | ppl   150.87\n",
      "| epoch  13 |  1200/ 3481 batches | lr 20.00 | ms/batch  9.34 | loss  4.98 | ppl   145.86\n",
      "| epoch  13 |  1400/ 3481 batches | lr 20.00 | ms/batch  9.42 | loss  5.02 | ppl   150.72\n",
      "| epoch  13 |  1600/ 3481 batches | lr 20.00 | ms/batch  9.36 | loss  5.04 | ppl   155.19\n",
      "| epoch  13 |  1800/ 3481 batches | lr 20.00 | ms/batch  9.39 | loss  5.07 | ppl   158.75\n",
      "| epoch  13 |  2000/ 3481 batches | lr 20.00 | ms/batch  9.41 | loss  5.03 | ppl   153.09\n",
      "| epoch  13 |  2200/ 3481 batches | lr 20.00 | ms/batch  9.35 | loss  5.05 | ppl   156.72\n",
      "| epoch  13 |  2400/ 3481 batches | lr 20.00 | ms/batch  9.35 | loss  5.02 | ppl   152.05\n",
      "| epoch  13 |  2600/ 3481 batches | lr 20.00 | ms/batch  9.40 | loss  4.92 | ppl   137.61\n",
      "| epoch  13 |  2800/ 3481 batches | lr 20.00 | ms/batch  9.42 | loss  5.01 | ppl   149.30\n",
      "| epoch  13 |  3000/ 3481 batches | lr 20.00 | ms/batch  9.35 | loss  5.01 | ppl   149.65\n",
      "| epoch  13 |  3200/ 3481 batches | lr 20.00 | ms/batch  9.43 | loss  4.99 | ppl   147.47\n",
      "| epoch  13 |  3400/ 3481 batches | lr 20.00 | ms/batch  9.35 | loss  4.95 | ppl   140.52\n",
      "val loss 4.921083124655394\n",
      "-----------------------------------------------------------------------------------------\n",
      "| end of epoch  13 | time: 34.22s | valid loss  4.92 | valid ppl   137.15\n",
      "-----------------------------------------------------------------------------------------\n",
      "| epoch  14 |   200/ 3481 batches | lr 20.00 | ms/batch 11.33 | loss  5.03 | ppl   152.27\n",
      "| epoch  14 |   400/ 3481 batches | lr 20.00 | ms/batch  9.37 | loss  5.04 | ppl   154.79\n",
      "| epoch  14 |   600/ 3481 batches | lr 20.00 | ms/batch  9.28 | loss  4.98 | ppl   145.86\n",
      "| epoch  14 |   800/ 3481 batches | lr 20.00 | ms/batch  9.39 | loss  4.90 | ppl   133.71\n",
      "| epoch  14 |  1000/ 3481 batches | lr 20.00 | ms/batch  9.37 | loss  5.00 | ppl   148.49\n",
      "| epoch  14 |  1200/ 3481 batches | lr 20.00 | ms/batch  9.40 | loss  4.97 | ppl   143.43\n",
      "| epoch  14 |  1400/ 3481 batches | lr 20.00 | ms/batch  9.38 | loss  5.00 | ppl   148.05\n",
      "| epoch  14 |  1600/ 3481 batches | lr 20.00 | ms/batch  9.43 | loss  5.03 | ppl   153.25\n",
      "| epoch  14 |  1800/ 3481 batches | lr 20.00 | ms/batch  9.45 | loss  5.06 | ppl   157.37\n",
      "| epoch  14 |  2000/ 3481 batches | lr 20.00 | ms/batch  9.44 | loss  5.02 | ppl   151.26\n",
      "| epoch  14 |  2200/ 3481 batches | lr 20.00 | ms/batch  9.40 | loss  5.04 | ppl   154.57\n",
      "| epoch  14 |  2400/ 3481 batches | lr 20.00 | ms/batch  9.42 | loss  5.01 | ppl   150.07\n",
      "| epoch  14 |  2600/ 3481 batches | lr 20.00 | ms/batch  9.38 | loss  4.91 | ppl   136.25\n",
      "| epoch  14 |  2800/ 3481 batches | lr 20.00 | ms/batch  9.40 | loss  5.00 | ppl   147.84\n",
      "| epoch  14 |  3000/ 3481 batches | lr 20.00 | ms/batch  9.43 | loss  5.00 | ppl   148.68\n",
      "| epoch  14 |  3200/ 3481 batches | lr 20.00 | ms/batch  9.41 | loss  4.99 | ppl   146.88\n",
      "| epoch  14 |  3400/ 3481 batches | lr 20.00 | ms/batch  9.36 | loss  4.94 | ppl   139.28\n",
      "val loss 4.920271865236169\n",
      "-----------------------------------------------------------------------------------------\n",
      "| end of epoch  14 | time: 34.23s | valid loss  4.92 | valid ppl   137.04\n",
      "-----------------------------------------------------------------------------------------\n",
      "| epoch  15 |   200/ 3481 batches | lr 20.00 | ms/batch 11.34 | loss  5.01 | ppl   150.49\n",
      "| epoch  15 |   400/ 3481 batches | lr 20.00 | ms/batch  9.45 | loss  5.03 | ppl   153.39\n",
      "| epoch  15 |   600/ 3481 batches | lr 20.00 | ms/batch  9.37 | loss  4.97 | ppl   144.15\n",
      "| epoch  15 |   800/ 3481 batches | lr 20.00 | ms/batch  9.48 | loss  4.89 | ppl   132.47\n",
      "| epoch  15 |  1000/ 3481 batches | lr 20.00 | ms/batch  9.42 | loss  4.99 | ppl   147.63\n",
      "| epoch  15 |  1200/ 3481 batches | lr 20.00 | ms/batch  9.43 | loss  4.95 | ppl   141.38\n",
      "| epoch  15 |  1400/ 3481 batches | lr 20.00 | ms/batch  9.37 | loss  4.99 | ppl   146.58\n",
      "| epoch  15 |  1600/ 3481 batches | lr 20.00 | ms/batch  9.43 | loss  5.01 | ppl   150.53\n",
      "| epoch  15 |  1800/ 3481 batches | lr 20.00 | ms/batch  9.76 | loss  5.05 | ppl   155.80\n",
      "| epoch  15 |  2000/ 3481 batches | lr 20.00 | ms/batch  9.39 | loss  5.01 | ppl   149.36\n",
      "| epoch  15 |  2200/ 3481 batches | lr 20.00 | ms/batch  9.45 | loss  5.03 | ppl   152.41\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "| epoch  15 |  2400/ 3481 batches | lr 20.00 | ms/batch  9.39 | loss  5.00 | ppl   148.97\n",
      "| epoch  15 |  2600/ 3481 batches | lr 20.00 | ms/batch  9.46 | loss  4.90 | ppl   134.61\n",
      "| epoch  15 |  2800/ 3481 batches | lr 20.00 | ms/batch  9.42 | loss  4.99 | ppl   146.89\n",
      "| epoch  15 |  3000/ 3481 batches | lr 20.00 | ms/batch  9.43 | loss  4.99 | ppl   147.16\n",
      "| epoch  15 |  3200/ 3481 batches | lr 20.00 | ms/batch  9.48 | loss  4.97 | ppl   144.25\n",
      "| epoch  15 |  3400/ 3481 batches | lr 20.00 | ms/batch  9.42 | loss  4.92 | ppl   137.52\n",
      "val loss 4.902327955568829\n",
      "-----------------------------------------------------------------------------------------\n",
      "| end of epoch  15 | time: 34.40s | valid loss  4.90 | valid ppl   134.60\n",
      "-----------------------------------------------------------------------------------------\n",
      "| epoch  16 |   200/ 3481 batches | lr 20.00 | ms/batch 11.45 | loss  5.00 | ppl   148.31\n",
      "| epoch  16 |   400/ 3481 batches | lr 20.00 | ms/batch  9.40 | loss  5.02 | ppl   151.58\n",
      "| epoch  16 |   600/ 3481 batches | lr 20.00 | ms/batch  9.39 | loss  4.96 | ppl   142.07\n",
      "| epoch  16 |   800/ 3481 batches | lr 20.00 | ms/batch  9.41 | loss  4.87 | ppl   130.91\n",
      "| epoch  16 |  1000/ 3481 batches | lr 20.00 | ms/batch  9.46 | loss  4.98 | ppl   145.82\n",
      "| epoch  16 |  1200/ 3481 batches | lr 20.00 | ms/batch  9.51 | loss  4.94 | ppl   140.22\n",
      "| epoch  16 |  1400/ 3481 batches | lr 20.00 | ms/batch  9.42 | loss  4.98 | ppl   145.11\n",
      "| epoch  16 |  1600/ 3481 batches | lr 20.00 | ms/batch  9.47 | loss  5.01 | ppl   149.70\n",
      "| epoch  16 |  1800/ 3481 batches | lr 20.00 | ms/batch  9.47 | loss  5.04 | ppl   153.70\n",
      "| epoch  16 |  2000/ 3481 batches | lr 20.00 | ms/batch  9.51 | loss  5.00 | ppl   148.68\n",
      "| epoch  16 |  2200/ 3481 batches | lr 20.00 | ms/batch  9.44 | loss  5.02 | ppl   151.26\n",
      "| epoch  16 |  2400/ 3481 batches | lr 20.00 | ms/batch  9.45 | loss  4.99 | ppl   147.43\n",
      "| epoch  16 |  2600/ 3481 batches | lr 20.00 | ms/batch  9.46 | loss  4.90 | ppl   133.80\n",
      "| epoch  16 |  2800/ 3481 batches | lr 20.00 | ms/batch  9.38 | loss  4.98 | ppl   145.13\n",
      "| epoch  16 |  3000/ 3481 batches | lr 20.00 | ms/batch  9.53 | loss  4.98 | ppl   145.12\n",
      "| epoch  16 |  3200/ 3481 batches | lr 20.00 | ms/batch  9.42 | loss  4.96 | ppl   143.27\n",
      "| epoch  16 |  3400/ 3481 batches | lr 20.00 | ms/batch  9.50 | loss  4.91 | ppl   136.04\n",
      "val loss 4.912909506524536\n",
      "-----------------------------------------------------------------------------------------\n",
      "| end of epoch  16 | time: 34.44s | valid loss  4.91 | valid ppl   136.03\n",
      "-----------------------------------------------------------------------------------------\n",
      "| epoch  17 |   200/ 3481 batches | lr 5.00 | ms/batch 11.43 | loss  5.02 | ppl   151.95\n",
      "| epoch  17 |   400/ 3481 batches | lr 5.00 | ms/batch  9.50 | loss  5.03 | ppl   152.63\n",
      "| epoch  17 |   600/ 3481 batches | lr 5.00 | ms/batch  9.47 | loss  4.95 | ppl   141.16\n",
      "| epoch  17 |   800/ 3481 batches | lr 5.00 | ms/batch  9.46 | loss  4.87 | ppl   130.54\n",
      "| epoch  17 |  1000/ 3481 batches | lr 5.00 | ms/batch  9.48 | loss  4.97 | ppl   143.42\n",
      "| epoch  17 |  1200/ 3481 batches | lr 5.00 | ms/batch  9.49 | loss  4.90 | ppl   134.94\n",
      "| epoch  17 |  1400/ 3481 batches | lr 5.00 | ms/batch  9.50 | loss  4.92 | ppl   137.41\n",
      "| epoch  17 |  1600/ 3481 batches | lr 5.00 | ms/batch  9.51 | loss  4.95 | ppl   140.72\n",
      "| epoch  17 |  1800/ 3481 batches | lr 5.00 | ms/batch  9.47 | loss  4.96 | ppl   142.18\n",
      "| epoch  17 |  2000/ 3481 batches | lr 5.00 | ms/batch  9.49 | loss  4.92 | ppl   137.50\n",
      "| epoch  17 |  2200/ 3481 batches | lr 5.00 | ms/batch  9.48 | loss  4.93 | ppl   138.54\n",
      "| epoch  17 |  2400/ 3481 batches | lr 5.00 | ms/batch  9.47 | loss  4.89 | ppl   133.48\n",
      "| epoch  17 |  2600/ 3481 batches | lr 5.00 | ms/batch  9.43 | loss  4.78 | ppl   119.31\n",
      "| epoch  17 |  2800/ 3481 batches | lr 5.00 | ms/batch  9.51 | loss  4.85 | ppl   127.78\n",
      "| epoch  17 |  3000/ 3481 batches | lr 5.00 | ms/batch  9.48 | loss  4.86 | ppl   128.40\n",
      "| epoch  17 |  3200/ 3481 batches | lr 5.00 | ms/batch  9.46 | loss  4.83 | ppl   125.66\n",
      "| epoch  17 |  3400/ 3481 batches | lr 5.00 | ms/batch  9.48 | loss  4.77 | ppl   117.83\n",
      "val loss 4.796092457728359\n",
      "-----------------------------------------------------------------------------------------\n",
      "| end of epoch  17 | time: 34.53s | valid loss  4.80 | valid ppl   121.04\n",
      "-----------------------------------------------------------------------------------------\n",
      "| epoch  18 |   200/ 3481 batches | lr 5.00 | ms/batch 11.43 | loss  4.92 | ppl   136.52\n",
      "| epoch  18 |   400/ 3481 batches | lr 5.00 | ms/batch  9.40 | loss  4.94 | ppl   139.53\n",
      "| epoch  18 |   600/ 3481 batches | lr 5.00 | ms/batch  9.54 | loss  4.87 | ppl   129.77\n",
      "| epoch  18 |   800/ 3481 batches | lr 5.00 | ms/batch  9.46 | loss  4.79 | ppl   120.60\n",
      "| epoch  18 |  1000/ 3481 batches | lr 5.00 | ms/batch  9.42 | loss  4.89 | ppl   133.32\n",
      "| epoch  18 |  1200/ 3481 batches | lr 5.00 | ms/batch  9.41 | loss  4.84 | ppl   126.15\n",
      "| epoch  18 |  1400/ 3481 batches | lr 5.00 | ms/batch  9.46 | loss  4.87 | ppl   129.69\n",
      "| epoch  18 |  1600/ 3481 batches | lr 5.00 | ms/batch  9.68 | loss  4.90 | ppl   134.40\n",
      "| epoch  18 |  1800/ 3481 batches | lr 5.00 | ms/batch  9.42 | loss  4.91 | ppl   135.83\n",
      "| epoch  18 |  2000/ 3481 batches | lr 5.00 | ms/batch  9.46 | loss  4.88 | ppl   131.68\n",
      "| epoch  18 |  2200/ 3481 batches | lr 5.00 | ms/batch  9.47 | loss  4.90 | ppl   133.91\n",
      "| epoch  18 |  2400/ 3481 batches | lr 5.00 | ms/batch  9.42 | loss  4.86 | ppl   129.31\n",
      "| epoch  18 |  2600/ 3481 batches | lr 5.00 | ms/batch  9.43 | loss  4.76 | ppl   116.50\n",
      "| epoch  18 |  2800/ 3481 batches | lr 5.00 | ms/batch  9.40 | loss  4.82 | ppl   124.14\n",
      "| epoch  18 |  3000/ 3481 batches | lr 5.00 | ms/batch  9.38 | loss  4.83 | ppl   125.52\n",
      "| epoch  18 |  3200/ 3481 batches | lr 5.00 | ms/batch  9.46 | loss  4.81 | ppl   123.16\n",
      "| epoch  18 |  3400/ 3481 batches | lr 5.00 | ms/batch  9.45 | loss  4.76 | ppl   116.20\n",
      "val loss 4.7849659988972615\n",
      "-----------------------------------------------------------------------------------------\n",
      "| end of epoch  18 | time: 34.44s | valid loss  4.78 | valid ppl   119.70\n",
      "-----------------------------------------------------------------------------------------\n",
      "| epoch  19 |   200/ 3481 batches | lr 5.00 | ms/batch 11.41 | loss  4.88 | ppl   132.07\n",
      "| epoch  19 |   400/ 3481 batches | lr 5.00 | ms/batch  9.40 | loss  4.91 | ppl   135.29\n",
      "| epoch  19 |   600/ 3481 batches | lr 5.00 | ms/batch  9.53 | loss  4.84 | ppl   125.85\n",
      "| epoch  19 |   800/ 3481 batches | lr 5.00 | ms/batch  9.40 | loss  4.75 | ppl   116.05\n",
      "| epoch  19 |  1000/ 3481 batches | lr 5.00 | ms/batch  9.43 | loss  4.87 | ppl   130.13\n",
      "| epoch  19 |  1200/ 3481 batches | lr 5.00 | ms/batch  9.45 | loss  4.81 | ppl   122.67\n",
      "| epoch  19 |  1400/ 3481 batches | lr 5.00 | ms/batch  9.44 | loss  4.84 | ppl   127.01\n",
      "| epoch  19 |  1600/ 3481 batches | lr 5.00 | ms/batch  9.40 | loss  4.87 | ppl   130.68\n",
      "| epoch  19 |  1800/ 3481 batches | lr 5.00 | ms/batch  9.44 | loss  4.89 | ppl   132.74\n",
      "| epoch  19 |  2000/ 3481 batches | lr 5.00 | ms/batch  9.42 | loss  4.86 | ppl   129.16\n",
      "| epoch  19 |  2200/ 3481 batches | lr 5.00 | ms/batch  9.47 | loss  4.88 | ppl   131.91\n",
      "| epoch  19 |  2400/ 3481 batches | lr 5.00 | ms/batch  9.41 | loss  4.84 | ppl   127.03\n",
      "| epoch  19 |  2600/ 3481 batches | lr 5.00 | ms/batch  9.46 | loss  4.74 | ppl   114.70\n",
      "| epoch  19 |  2800/ 3481 batches | lr 5.00 | ms/batch  9.45 | loss  4.81 | ppl   122.16\n",
      "| epoch  19 |  3000/ 3481 batches | lr 5.00 | ms/batch  9.44 | loss  4.83 | ppl   124.61\n",
      "| epoch  19 |  3200/ 3481 batches | lr 5.00 | ms/batch  9.42 | loss  4.81 | ppl   122.30\n",
      "| epoch  19 |  3400/ 3481 batches | lr 5.00 | ms/batch  9.47 | loss  4.75 | ppl   115.13\n",
      "val loss 4.776596386234148\n",
      "-----------------------------------------------------------------------------------------\n",
      "| end of epoch  19 | time: 34.40s | valid loss  4.78 | valid ppl   118.70\n",
      "-----------------------------------------------------------------------------------------\n",
      "| epoch  20 |   200/ 3481 batches | lr 5.00 | ms/batch 11.36 | loss  4.86 | ppl   128.48\n",
      "| epoch  20 |   400/ 3481 batches | lr 5.00 | ms/batch  9.42 | loss  4.88 | ppl   132.06\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "| epoch  20 |   600/ 3481 batches | lr 5.00 | ms/batch  9.45 | loss  4.81 | ppl   123.16\n",
      "| epoch  20 |   800/ 3481 batches | lr 5.00 | ms/batch  9.51 | loss  4.74 | ppl   114.07\n",
      "| epoch  20 |  1000/ 3481 batches | lr 5.00 | ms/batch  9.45 | loss  4.84 | ppl   126.77\n",
      "| epoch  20 |  1200/ 3481 batches | lr 5.00 | ms/batch  9.46 | loss  4.79 | ppl   120.53\n",
      "| epoch  20 |  1400/ 3481 batches | lr 5.00 | ms/batch  9.43 | loss  4.83 | ppl   125.15\n",
      "| epoch  20 |  1600/ 3481 batches | lr 5.00 | ms/batch  9.44 | loss  4.85 | ppl   128.27\n",
      "| epoch  20 |  1800/ 3481 batches | lr 5.00 | ms/batch  9.44 | loss  4.88 | ppl   131.02\n",
      "| epoch  20 |  2000/ 3481 batches | lr 5.00 | ms/batch  9.46 | loss  4.85 | ppl   127.25\n",
      "| epoch  20 |  2200/ 3481 batches | lr 5.00 | ms/batch  9.48 | loss  4.86 | ppl   129.30\n",
      "| epoch  20 |  2400/ 3481 batches | lr 5.00 | ms/batch  9.42 | loss  4.83 | ppl   125.71\n",
      "| epoch  20 |  2600/ 3481 batches | lr 5.00 | ms/batch  9.48 | loss  4.72 | ppl   112.53\n",
      "| epoch  20 |  2800/ 3481 batches | lr 5.00 | ms/batch  9.47 | loss  4.80 | ppl   121.03\n",
      "| epoch  20 |  3000/ 3481 batches | lr 5.00 | ms/batch  9.43 | loss  4.80 | ppl   122.11\n",
      "| epoch  20 |  3200/ 3481 batches | lr 5.00 | ms/batch  9.38 | loss  4.80 | ppl   121.34\n",
      "| epoch  20 |  3400/ 3481 batches | lr 5.00 | ms/batch  9.39 | loss  4.74 | ppl   114.30\n",
      "val loss 4.768469074503768\n",
      "-----------------------------------------------------------------------------------------\n",
      "| end of epoch  20 | time: 34.42s | valid loss  4.77 | valid ppl   117.74\n",
      "-----------------------------------------------------------------------------------------\n",
      "| epoch  21 |   200/ 3481 batches | lr 5.00 | ms/batch 11.45 | loss  4.85 | ppl   127.20\n",
      "| epoch  21 |   400/ 3481 batches | lr 5.00 | ms/batch  9.45 | loss  4.87 | ppl   129.79\n",
      "| epoch  21 |   600/ 3481 batches | lr 5.00 | ms/batch  9.47 | loss  4.80 | ppl   121.93\n",
      "| epoch  21 |   800/ 3481 batches | lr 5.00 | ms/batch  9.40 | loss  4.72 | ppl   111.63\n",
      "| epoch  21 |  1000/ 3481 batches | lr 5.00 | ms/batch  9.60 | loss  4.83 | ppl   125.70\n",
      "| epoch  21 |  1200/ 3481 batches | lr 5.00 | ms/batch  9.44 | loss  4.78 | ppl   119.08\n",
      "| epoch  21 |  1400/ 3481 batches | lr 5.00 | ms/batch  9.46 | loss  4.81 | ppl   123.25\n",
      "| epoch  21 |  1600/ 3481 batches | lr 5.00 | ms/batch  9.45 | loss  4.85 | ppl   127.12\n",
      "| epoch  21 |  1800/ 3481 batches | lr 5.00 | ms/batch  9.44 | loss  4.86 | ppl   129.30\n",
      "| epoch  21 |  2000/ 3481 batches | lr 5.00 | ms/batch  9.47 | loss  4.84 | ppl   126.02\n",
      "| epoch  21 |  2200/ 3481 batches | lr 5.00 | ms/batch  9.48 | loss  4.85 | ppl   127.85\n",
      "| epoch  21 |  2400/ 3481 batches | lr 5.00 | ms/batch  9.43 | loss  4.83 | ppl   124.59\n",
      "| epoch  21 |  2600/ 3481 batches | lr 5.00 | ms/batch  9.45 | loss  4.71 | ppl   111.29\n",
      "| epoch  21 |  2800/ 3481 batches | lr 5.00 | ms/batch  9.45 | loss  4.78 | ppl   119.64\n",
      "| epoch  21 |  3000/ 3481 batches | lr 5.00 | ms/batch  9.45 | loss  4.80 | ppl   122.10\n",
      "| epoch  21 |  3200/ 3481 batches | lr 5.00 | ms/batch  9.43 | loss  4.78 | ppl   119.50\n",
      "| epoch  21 |  3400/ 3481 batches | lr 5.00 | ms/batch  9.45 | loss  4.72 | ppl   112.68\n",
      "val loss 4.764531953570116\n",
      "-----------------------------------------------------------------------------------------\n",
      "| end of epoch  21 | time: 34.46s | valid loss  4.76 | valid ppl   117.28\n",
      "-----------------------------------------------------------------------------------------\n",
      "| epoch  22 |   200/ 3481 batches | lr 5.00 | ms/batch 11.36 | loss  4.83 | ppl   125.05\n",
      "| epoch  22 |   400/ 3481 batches | lr 5.00 | ms/batch  9.36 | loss  4.85 | ppl   127.85\n",
      "| epoch  22 |   600/ 3481 batches | lr 5.00 | ms/batch  9.46 | loss  4.79 | ppl   120.62\n",
      "| epoch  22 |   800/ 3481 batches | lr 5.00 | ms/batch  9.43 | loss  4.70 | ppl   110.49\n",
      "| epoch  22 |  1000/ 3481 batches | lr 5.00 | ms/batch  9.44 | loss  4.82 | ppl   123.62\n",
      "| epoch  22 |  1200/ 3481 batches | lr 5.00 | ms/batch  9.57 | loss  4.77 | ppl   118.02\n",
      "| epoch  22 |  1400/ 3481 batches | lr 5.00 | ms/batch  9.40 | loss  4.80 | ppl   121.99\n",
      "| epoch  22 |  1600/ 3481 batches | lr 5.00 | ms/batch  9.44 | loss  4.84 | ppl   125.87\n",
      "| epoch  22 |  1800/ 3481 batches | lr 5.00 | ms/batch  9.37 | loss  4.85 | ppl   128.08\n",
      "| epoch  22 |  2000/ 3481 batches | lr 5.00 | ms/batch  9.45 | loss  4.82 | ppl   124.36\n",
      "| epoch  22 |  2200/ 3481 batches | lr 5.00 | ms/batch  9.44 | loss  4.84 | ppl   127.04\n",
      "| epoch  22 |  2400/ 3481 batches | lr 5.00 | ms/batch  9.43 | loss  4.81 | ppl   123.00\n",
      "| epoch  22 |  2600/ 3481 batches | lr 5.00 | ms/batch  9.40 | loss  4.70 | ppl   109.98\n",
      "| epoch  22 |  2800/ 3481 batches | lr 5.00 | ms/batch  9.39 | loss  4.78 | ppl   118.84\n",
      "| epoch  22 |  3000/ 3481 batches | lr 5.00 | ms/batch  9.44 | loss  4.79 | ppl   120.69\n",
      "| epoch  22 |  3200/ 3481 batches | lr 5.00 | ms/batch  9.40 | loss  4.78 | ppl   118.81\n",
      "| epoch  22 |  3400/ 3481 batches | lr 5.00 | ms/batch  9.43 | loss  4.72 | ppl   112.26\n",
      "val loss 4.764281037493108\n",
      "-----------------------------------------------------------------------------------------\n",
      "| end of epoch  22 | time: 34.35s | valid loss  4.76 | valid ppl   117.25\n",
      "-----------------------------------------------------------------------------------------\n",
      "| epoch  23 |   200/ 3481 batches | lr 5.00 | ms/batch 11.41 | loss  4.82 | ppl   124.58\n",
      "| epoch  23 |   400/ 3481 batches | lr 5.00 | ms/batch  9.38 | loss  4.85 | ppl   127.13\n",
      "| epoch  23 |   600/ 3481 batches | lr 5.00 | ms/batch  9.46 | loss  4.78 | ppl   118.76\n",
      "| epoch  23 |   800/ 3481 batches | lr 5.00 | ms/batch  9.48 | loss  4.70 | ppl   109.49\n",
      "| epoch  23 |  1000/ 3481 batches | lr 5.00 | ms/batch  9.47 | loss  4.81 | ppl   122.13\n",
      "| epoch  23 |  1200/ 3481 batches | lr 5.00 | ms/batch  9.55 | loss  4.76 | ppl   116.37\n",
      "| epoch  23 |  1400/ 3481 batches | lr 5.00 | ms/batch  9.40 | loss  4.80 | ppl   120.96\n",
      "| epoch  23 |  1600/ 3481 batches | lr 5.00 | ms/batch  9.47 | loss  4.82 | ppl   124.17\n",
      "| epoch  23 |  1800/ 3481 batches | lr 5.00 | ms/batch  9.46 | loss  4.84 | ppl   126.65\n",
      "| epoch  23 |  2000/ 3481 batches | lr 5.00 | ms/batch  9.46 | loss  4.81 | ppl   122.87\n",
      "| epoch  23 |  2200/ 3481 batches | lr 5.00 | ms/batch  9.55 | loss  4.84 | ppl   126.18\n",
      "| epoch  23 |  2400/ 3481 batches | lr 5.00 | ms/batch  9.40 | loss  4.80 | ppl   122.01\n",
      "| epoch  23 |  2600/ 3481 batches | lr 5.00 | ms/batch  9.43 | loss  4.69 | ppl   109.05\n",
      "| epoch  23 |  2800/ 3481 batches | lr 5.00 | ms/batch  9.46 | loss  4.77 | ppl   117.53\n",
      "| epoch  23 |  3000/ 3481 batches | lr 5.00 | ms/batch  9.45 | loss  4.78 | ppl   118.94\n",
      "| epoch  23 |  3200/ 3481 batches | lr 5.00 | ms/batch  9.44 | loss  4.77 | ppl   117.65\n",
      "| epoch  23 |  3400/ 3481 batches | lr 5.00 | ms/batch  9.46 | loss  4.71 | ppl   111.37\n",
      "val loss 4.758434226245176\n",
      "-----------------------------------------------------------------------------------------\n",
      "| end of epoch  23 | time: 34.45s | valid loss  4.76 | valid ppl   116.56\n",
      "-----------------------------------------------------------------------------------------\n",
      "| epoch  24 |   200/ 3481 batches | lr 5.00 | ms/batch 11.38 | loss  4.81 | ppl   123.07\n",
      "| epoch  24 |   400/ 3481 batches | lr 5.00 | ms/batch  9.39 | loss  4.83 | ppl   125.58\n",
      "| epoch  24 |   600/ 3481 batches | lr 5.00 | ms/batch  9.47 | loss  4.76 | ppl   117.30\n",
      "| epoch  24 |   800/ 3481 batches | lr 5.00 | ms/batch  9.46 | loss  4.69 | ppl   108.46\n",
      "| epoch  24 |  1000/ 3481 batches | lr 5.00 | ms/batch  9.46 | loss  4.80 | ppl   121.10\n",
      "| epoch  24 |  1200/ 3481 batches | lr 5.00 | ms/batch  9.45 | loss  4.75 | ppl   115.46\n",
      "| epoch  24 |  1400/ 3481 batches | lr 5.00 | ms/batch  9.46 | loss  4.79 | ppl   120.11\n",
      "| epoch  24 |  1600/ 3481 batches | lr 5.00 | ms/batch  9.46 | loss  4.82 | ppl   123.68\n",
      "| epoch  24 |  1800/ 3481 batches | lr 5.00 | ms/batch  9.45 | loss  4.83 | ppl   125.10\n",
      "| epoch  24 |  2000/ 3481 batches | lr 5.00 | ms/batch  9.47 | loss  4.81 | ppl   122.61\n",
      "| epoch  24 |  2200/ 3481 batches | lr 5.00 | ms/batch  9.43 | loss  4.83 | ppl   124.81\n",
      "| epoch  24 |  2400/ 3481 batches | lr 5.00 | ms/batch  9.45 | loss  4.80 | ppl   121.11\n",
      "| epoch  24 |  2600/ 3481 batches | lr 5.00 | ms/batch  9.43 | loss  4.69 | ppl   108.43\n",
      "| epoch  24 |  2800/ 3481 batches | lr 5.00 | ms/batch  9.40 | loss  4.76 | ppl   116.39\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "| epoch  24 |  3000/ 3481 batches | lr 5.00 | ms/batch  9.46 | loss  4.77 | ppl   118.44\n",
      "| epoch  24 |  3200/ 3481 batches | lr 5.00 | ms/batch  9.50 | loss  4.76 | ppl   116.71\n",
      "| epoch  24 |  3400/ 3481 batches | lr 5.00 | ms/batch  9.47 | loss  4.70 | ppl   110.10\n",
      "val loss 4.750752389266679\n",
      "-----------------------------------------------------------------------------------------\n",
      "| end of epoch  24 | time: 34.43s | valid loss  4.75 | valid ppl   115.67\n",
      "-----------------------------------------------------------------------------------------\n",
      "| epoch  25 |   200/ 3481 batches | lr 5.00 | ms/batch 11.33 | loss  4.80 | ppl   121.35\n",
      "| epoch  25 |   400/ 3481 batches | lr 5.00 | ms/batch  9.63 | loss  4.82 | ppl   124.49\n",
      "| epoch  25 |   600/ 3481 batches | lr 5.00 | ms/batch  9.37 | loss  4.76 | ppl   117.28\n",
      "| epoch  25 |   800/ 3481 batches | lr 5.00 | ms/batch  9.43 | loss  4.68 | ppl   107.52\n",
      "| epoch  25 |  1000/ 3481 batches | lr 5.00 | ms/batch  9.41 | loss  4.79 | ppl   120.13\n",
      "| epoch  25 |  1200/ 3481 batches | lr 5.00 | ms/batch  9.43 | loss  4.74 | ppl   114.32\n",
      "| epoch  25 |  1400/ 3481 batches | lr 5.00 | ms/batch  9.50 | loss  4.78 | ppl   118.66\n",
      "| epoch  25 |  1600/ 3481 batches | lr 5.00 | ms/batch  9.37 | loss  4.81 | ppl   122.47\n",
      "| epoch  25 |  1800/ 3481 batches | lr 5.00 | ms/batch  9.43 | loss  4.82 | ppl   124.36\n",
      "| epoch  25 |  2000/ 3481 batches | lr 5.00 | ms/batch  9.40 | loss  4.80 | ppl   120.97\n",
      "| epoch  25 |  2200/ 3481 batches | lr 5.00 | ms/batch  9.44 | loss  4.82 | ppl   124.42\n",
      "| epoch  25 |  2400/ 3481 batches | lr 5.00 | ms/batch  9.43 | loss  4.79 | ppl   120.00\n",
      "| epoch  25 |  2600/ 3481 batches | lr 5.00 | ms/batch  9.45 | loss  4.67 | ppl   107.19\n",
      "| epoch  25 |  2800/ 3481 batches | lr 5.00 | ms/batch  9.43 | loss  4.76 | ppl   116.35\n",
      "| epoch  25 |  3000/ 3481 batches | lr 5.00 | ms/batch  9.42 | loss  4.77 | ppl   117.64\n",
      "| epoch  25 |  3200/ 3481 batches | lr 5.00 | ms/batch  9.43 | loss  4.76 | ppl   116.43\n",
      "| epoch  25 |  3400/ 3481 batches | lr 5.00 | ms/batch  9.43 | loss  4.69 | ppl   109.23\n",
      "val loss 4.752800998782393\n",
      "-----------------------------------------------------------------------------------------\n",
      "| end of epoch  25 | time: 34.37s | valid loss  4.75 | valid ppl   115.91\n",
      "-----------------------------------------------------------------------------------------\n",
      "| epoch  26 |   200/ 3481 batches | lr 1.25 | ms/batch 11.37 | loss  4.83 | ppl   124.65\n",
      "| epoch  26 |   400/ 3481 batches | lr 1.25 | ms/batch  9.52 | loss  4.85 | ppl   128.35\n",
      "| epoch  26 |   600/ 3481 batches | lr 1.25 | ms/batch  9.40 | loss  4.78 | ppl   119.19\n",
      "| epoch  26 |   800/ 3481 batches | lr 1.25 | ms/batch  9.44 | loss  4.71 | ppl   111.00\n",
      "| epoch  26 |  1000/ 3481 batches | lr 1.25 | ms/batch  9.49 | loss  4.81 | ppl   123.31\n",
      "| epoch  26 |  1200/ 3481 batches | lr 1.25 | ms/batch  9.46 | loss  4.75 | ppl   115.87\n",
      "| epoch  26 |  1400/ 3481 batches | lr 1.25 | ms/batch  9.45 | loss  4.79 | ppl   119.73\n",
      "| epoch  26 |  1600/ 3481 batches | lr 1.25 | ms/batch  9.37 | loss  4.81 | ppl   122.90\n",
      "| epoch  26 |  1800/ 3481 batches | lr 1.25 | ms/batch  9.43 | loss  4.82 | ppl   123.83\n",
      "| epoch  26 |  2000/ 3481 batches | lr 1.25 | ms/batch  9.44 | loss  4.80 | ppl   121.05\n",
      "| epoch  26 |  2200/ 3481 batches | lr 1.25 | ms/batch  9.47 | loss  4.81 | ppl   122.40\n",
      "| epoch  26 |  2400/ 3481 batches | lr 1.25 | ms/batch  9.48 | loss  4.79 | ppl   119.77\n",
      "| epoch  26 |  2600/ 3481 batches | lr 1.25 | ms/batch  9.43 | loss  4.66 | ppl   105.99\n",
      "| epoch  26 |  2800/ 3481 batches | lr 1.25 | ms/batch  9.44 | loss  4.73 | ppl   113.22\n",
      "| epoch  26 |  3000/ 3481 batches | lr 1.25 | ms/batch  9.44 | loss  4.74 | ppl   114.37\n",
      "| epoch  26 |  3200/ 3481 batches | lr 1.25 | ms/batch  9.39 | loss  4.73 | ppl   112.79\n",
      "| epoch  26 |  3400/ 3481 batches | lr 1.25 | ms/batch  9.45 | loss  4.66 | ppl   105.98\n",
      "val loss 4.718070839459658\n",
      "-----------------------------------------------------------------------------------------\n",
      "| end of epoch  26 | time: 34.40s | valid loss  4.72 | valid ppl   111.95\n",
      "-----------------------------------------------------------------------------------------\n",
      "| epoch  27 |   200/ 3481 batches | lr 1.25 | ms/batch 11.38 | loss  4.80 | ppl   121.49\n",
      "| epoch  27 |   400/ 3481 batches | lr 1.25 | ms/batch  9.40 | loss  4.83 | ppl   124.59\n",
      "| epoch  27 |   600/ 3481 batches | lr 1.25 | ms/batch  9.53 | loss  4.76 | ppl   116.32\n",
      "| epoch  27 |   800/ 3481 batches | lr 1.25 | ms/batch  9.45 | loss  4.68 | ppl   107.52\n",
      "| epoch  27 |  1000/ 3481 batches | lr 1.25 | ms/batch  9.44 | loss  4.79 | ppl   120.14\n",
      "| epoch  27 |  1200/ 3481 batches | lr 1.25 | ms/batch  9.46 | loss  4.74 | ppl   114.21\n",
      "| epoch  27 |  1400/ 3481 batches | lr 1.25 | ms/batch  9.45 | loss  4.77 | ppl   117.61\n",
      "| epoch  27 |  1600/ 3481 batches | lr 1.25 | ms/batch  9.45 | loss  4.80 | ppl   121.12\n",
      "| epoch  27 |  1800/ 3481 batches | lr 1.25 | ms/batch  9.45 | loss  4.81 | ppl   122.86\n",
      "| epoch  27 |  2000/ 3481 batches | lr 1.25 | ms/batch  9.44 | loss  4.78 | ppl   119.32\n",
      "| epoch  27 |  2200/ 3481 batches | lr 1.25 | ms/batch  9.46 | loss  4.80 | ppl   121.58\n",
      "| epoch  27 |  2400/ 3481 batches | lr 1.25 | ms/batch  9.40 | loss  4.77 | ppl   118.43\n",
      "| epoch  27 |  2600/ 3481 batches | lr 1.25 | ms/batch  9.47 | loss  4.65 | ppl   104.84\n",
      "| epoch  27 |  2800/ 3481 batches | lr 1.25 | ms/batch  9.41 | loss  4.72 | ppl   112.47\n",
      "| epoch  27 |  3000/ 3481 batches | lr 1.25 | ms/batch  9.52 | loss  4.74 | ppl   114.80\n",
      "| epoch  27 |  3200/ 3481 batches | lr 1.25 | ms/batch  9.47 | loss  4.73 | ppl   113.02\n",
      "| epoch  27 |  3400/ 3481 batches | lr 1.25 | ms/batch  9.45 | loss  4.67 | ppl   106.37\n",
      "val loss 4.714983387130123\n",
      "-----------------------------------------------------------------------------------------\n",
      "| end of epoch  27 | time: 34.43s | valid loss  4.71 | valid ppl   111.61\n",
      "-----------------------------------------------------------------------------------------\n",
      "| epoch  28 |   200/ 3481 batches | lr 1.25 | ms/batch 11.37 | loss  4.79 | ppl   120.64\n",
      "| epoch  28 |   400/ 3481 batches | lr 1.25 | ms/batch  9.40 | loss  4.81 | ppl   123.15\n",
      "| epoch  28 |   600/ 3481 batches | lr 1.25 | ms/batch  9.44 | loss  4.75 | ppl   115.34\n",
      "| epoch  28 |   800/ 3481 batches | lr 1.25 | ms/batch  9.51 | loss  4.66 | ppl   106.03\n",
      "| epoch  28 |  1000/ 3481 batches | lr 1.25 | ms/batch  9.48 | loss  4.79 | ppl   119.79\n",
      "| epoch  28 |  1200/ 3481 batches | lr 1.25 | ms/batch  9.36 | loss  4.73 | ppl   113.29\n",
      "| epoch  28 |  1400/ 3481 batches | lr 1.25 | ms/batch  9.48 | loss  4.76 | ppl   117.10\n",
      "| epoch  28 |  1600/ 3481 batches | lr 1.25 | ms/batch  9.45 | loss  4.79 | ppl   120.11\n",
      "| epoch  28 |  1800/ 3481 batches | lr 1.25 | ms/batch  9.46 | loss  4.80 | ppl   121.05\n",
      "| epoch  28 |  2000/ 3481 batches | lr 1.25 | ms/batch  9.45 | loss  4.77 | ppl   118.50\n",
      "| epoch  28 |  2200/ 3481 batches | lr 1.25 | ms/batch  9.42 | loss  4.79 | ppl   120.73\n",
      "| epoch  28 |  2400/ 3481 batches | lr 1.25 | ms/batch  9.42 | loss  4.77 | ppl   118.26\n",
      "| epoch  28 |  2600/ 3481 batches | lr 1.25 | ms/batch  9.45 | loss  4.65 | ppl   104.72\n",
      "| epoch  28 |  2800/ 3481 batches | lr 1.25 | ms/batch  9.41 | loss  4.72 | ppl   112.19\n",
      "| epoch  28 |  3000/ 3481 batches | lr 1.25 | ms/batch  9.45 | loss  4.73 | ppl   113.77\n",
      "| epoch  28 |  3200/ 3481 batches | lr 1.25 | ms/batch  9.41 | loss  4.72 | ppl   112.63\n",
      "| epoch  28 |  3400/ 3481 batches | lr 1.25 | ms/batch  9.47 | loss  4.67 | ppl   106.26\n",
      "val loss 4.7120169057847825\n",
      "-----------------------------------------------------------------------------------------\n",
      "| end of epoch  28 | time: 34.40s | valid loss  4.71 | valid ppl   111.28\n",
      "-----------------------------------------------------------------------------------------\n",
      "| epoch  29 |   200/ 3481 batches | lr 1.25 | ms/batch 11.36 | loss  4.78 | ppl   119.38\n",
      "| epoch  29 |   400/ 3481 batches | lr 1.25 | ms/batch  9.38 | loss  4.81 | ppl   122.63\n",
      "| epoch  29 |   600/ 3481 batches | lr 1.25 | ms/batch  9.44 | loss  4.74 | ppl   114.56\n",
      "| epoch  29 |   800/ 3481 batches | lr 1.25 | ms/batch  9.46 | loss  4.66 | ppl   105.33\n",
      "| epoch  29 |  1000/ 3481 batches | lr 1.25 | ms/batch  9.37 | loss  4.77 | ppl   118.43\n",
      "| epoch  29 |  1200/ 3481 batches | lr 1.25 | ms/batch  9.43 | loss  4.73 | ppl   112.83\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "| epoch  29 |  1400/ 3481 batches | lr 1.25 | ms/batch  9.44 | loss  4.75 | ppl   116.07\n",
      "| epoch  29 |  1600/ 3481 batches | lr 1.25 | ms/batch  9.42 | loss  4.79 | ppl   119.93\n",
      "| epoch  29 |  1800/ 3481 batches | lr 1.25 | ms/batch  9.40 | loss  4.79 | ppl   120.43\n",
      "| epoch  29 |  2000/ 3481 batches | lr 1.25 | ms/batch  9.41 | loss  4.78 | ppl   118.58\n",
      "| epoch  29 |  2200/ 3481 batches | lr 1.25 | ms/batch  9.41 | loss  4.79 | ppl   120.63\n",
      "| epoch  29 |  2400/ 3481 batches | lr 1.25 | ms/batch  9.44 | loss  4.76 | ppl   117.32\n",
      "| epoch  29 |  2600/ 3481 batches | lr 1.25 | ms/batch  9.42 | loss  4.65 | ppl   104.57\n",
      "| epoch  29 |  2800/ 3481 batches | lr 1.25 | ms/batch  9.43 | loss  4.72 | ppl   112.63\n",
      "| epoch  29 |  3000/ 3481 batches | lr 1.25 | ms/batch  9.45 | loss  4.74 | ppl   114.07\n",
      "| epoch  29 |  3200/ 3481 batches | lr 1.25 | ms/batch  9.45 | loss  4.72 | ppl   111.85\n",
      "| epoch  29 |  3400/ 3481 batches | lr 1.25 | ms/batch  9.40 | loss  4.66 | ppl   105.67\n",
      "val loss 4.71086929677449\n",
      "-----------------------------------------------------------------------------------------\n",
      "| end of epoch  29 | time: 34.32s | valid loss  4.71 | valid ppl   111.15\n",
      "-----------------------------------------------------------------------------------------\n",
      "| epoch  30 |   200/ 3481 batches | lr 1.25 | ms/batch 12.26 | loss  4.78 | ppl   119.05\n",
      "| epoch  30 |   400/ 3481 batches | lr 1.25 | ms/batch  9.49 | loss  4.80 | ppl   121.80\n",
      "| epoch  30 |   600/ 3481 batches | lr 1.25 | ms/batch  9.35 | loss  4.74 | ppl   114.23\n",
      "| epoch  30 |   800/ 3481 batches | lr 1.25 | ms/batch  9.36 | loss  4.65 | ppl   104.93\n",
      "| epoch  30 |  1000/ 3481 batches | lr 1.25 | ms/batch  9.46 | loss  4.77 | ppl   117.55\n",
      "| epoch  30 |  1200/ 3481 batches | lr 1.25 | ms/batch  9.46 | loss  4.71 | ppl   111.48\n",
      "| epoch  30 |  1400/ 3481 batches | lr 1.25 | ms/batch  9.44 | loss  4.75 | ppl   115.87\n",
      "| epoch  30 |  1600/ 3481 batches | lr 1.25 | ms/batch  9.43 | loss  4.78 | ppl   119.63\n",
      "| epoch  30 |  1800/ 3481 batches | lr 1.25 | ms/batch  9.42 | loss  4.79 | ppl   120.27\n",
      "| epoch  30 |  2000/ 3481 batches | lr 1.25 | ms/batch  9.47 | loss  4.76 | ppl   117.29\n",
      "| epoch  30 |  2200/ 3481 batches | lr 1.25 | ms/batch  9.41 | loss  4.79 | ppl   120.27\n",
      "| epoch  30 |  2400/ 3481 batches | lr 1.25 | ms/batch  9.49 | loss  4.76 | ppl   117.07\n",
      "| epoch  30 |  2600/ 3481 batches | lr 1.25 | ms/batch  9.45 | loss  4.64 | ppl   103.91\n",
      "| epoch  30 |  2800/ 3481 batches | lr 1.25 | ms/batch  9.46 | loss  4.71 | ppl   111.48\n",
      "| epoch  30 |  3000/ 3481 batches | lr 1.25 | ms/batch  9.44 | loss  4.73 | ppl   113.58\n",
      "| epoch  30 |  3200/ 3481 batches | lr 1.25 | ms/batch  9.50 | loss  4.72 | ppl   111.96\n",
      "| epoch  30 |  3400/ 3481 batches | lr 1.25 | ms/batch  9.48 | loss  4.66 | ppl   105.84\n",
      "val loss 4.708646230587208\n",
      "-----------------------------------------------------------------------------------------\n",
      "| end of epoch  30 | time: 34.59s | valid loss  4.71 | valid ppl   110.90\n",
      "-----------------------------------------------------------------------------------------\n",
      "| epoch  31 |   200/ 3481 batches | lr 1.25 | ms/batch 11.39 | loss  4.78 | ppl   118.70\n",
      "| epoch  31 |   400/ 3481 batches | lr 1.25 | ms/batch  9.46 | loss  4.80 | ppl   121.60\n",
      "| epoch  31 |   600/ 3481 batches | lr 1.25 | ms/batch  9.45 | loss  4.73 | ppl   113.65\n",
      "| epoch  31 |   800/ 3481 batches | lr 1.25 | ms/batch  9.45 | loss  4.65 | ppl   104.45\n",
      "| epoch  31 |  1000/ 3481 batches | lr 1.25 | ms/batch  9.47 | loss  4.77 | ppl   117.39\n",
      "| epoch  31 |  1200/ 3481 batches | lr 1.25 | ms/batch  9.53 | loss  4.71 | ppl   111.14\n",
      "| epoch  31 |  1400/ 3481 batches | lr 1.25 | ms/batch  9.45 | loss  4.75 | ppl   115.90\n",
      "| epoch  31 |  1600/ 3481 batches | lr 1.25 | ms/batch  9.59 | loss  4.78 | ppl   118.96\n",
      "| epoch  31 |  1800/ 3481 batches | lr 1.25 | ms/batch  9.46 | loss  4.79 | ppl   120.21\n",
      "| epoch  31 |  2000/ 3481 batches | lr 1.25 | ms/batch  9.44 | loss  4.77 | ppl   117.76\n",
      "| epoch  31 |  2200/ 3481 batches | lr 1.25 | ms/batch  9.48 | loss  4.78 | ppl   119.70\n",
      "| epoch  31 |  2400/ 3481 batches | lr 1.25 | ms/batch  9.46 | loss  4.76 | ppl   117.14\n",
      "| epoch  31 |  2600/ 3481 batches | lr 1.25 | ms/batch  9.40 | loss  4.65 | ppl   104.06\n",
      "| epoch  31 |  2800/ 3481 batches | lr 1.25 | ms/batch  9.48 | loss  4.71 | ppl   111.45\n",
      "| epoch  31 |  3000/ 3481 batches | lr 1.25 | ms/batch  9.48 | loss  4.74 | ppl   113.90\n",
      "| epoch  31 |  3200/ 3481 batches | lr 1.25 | ms/batch  9.45 | loss  4.72 | ppl   112.04\n",
      "| epoch  31 |  3400/ 3481 batches | lr 1.25 | ms/batch  9.47 | loss  4.66 | ppl   105.16\n",
      "val loss 4.707807330913435\n",
      "-----------------------------------------------------------------------------------------\n",
      "| end of epoch  31 | time: 34.49s | valid loss  4.71 | valid ppl   110.81\n",
      "-----------------------------------------------------------------------------------------\n",
      "| epoch  32 |   200/ 3481 batches | lr 1.25 | ms/batch 11.54 | loss  4.77 | ppl   117.97\n",
      "| epoch  32 |   400/ 3481 batches | lr 1.25 | ms/batch  9.39 | loss  4.79 | ppl   120.90\n",
      "| epoch  32 |   600/ 3481 batches | lr 1.25 | ms/batch  9.56 | loss  4.73 | ppl   113.41\n",
      "| epoch  32 |   800/ 3481 batches | lr 1.25 | ms/batch  9.42 | loss  4.65 | ppl   104.61\n",
      "| epoch  32 |  1000/ 3481 batches | lr 1.25 | ms/batch  9.47 | loss  4.76 | ppl   117.12\n",
      "| epoch  32 |  1200/ 3481 batches | lr 1.25 | ms/batch  9.44 | loss  4.71 | ppl   111.50\n",
      "| epoch  32 |  1400/ 3481 batches | lr 1.25 | ms/batch  9.56 | loss  4.74 | ppl   114.59\n",
      "| epoch  32 |  1600/ 3481 batches | lr 1.25 | ms/batch  9.43 | loss  4.77 | ppl   118.33\n",
      "| epoch  32 |  1800/ 3481 batches | lr 1.25 | ms/batch  9.56 | loss  4.79 | ppl   119.90\n",
      "| epoch  32 |  2000/ 3481 batches | lr 1.25 | ms/batch  9.41 | loss  4.76 | ppl   117.24\n",
      "| epoch  32 |  2200/ 3481 batches | lr 1.25 | ms/batch  9.40 | loss  4.78 | ppl   119.62\n",
      "| epoch  32 |  2400/ 3481 batches | lr 1.25 | ms/batch  9.35 | loss  4.76 | ppl   116.52\n",
      "| epoch  32 |  2600/ 3481 batches | lr 1.25 | ms/batch  9.43 | loss  4.64 | ppl   103.52\n",
      "| epoch  32 |  2800/ 3481 batches | lr 1.25 | ms/batch  9.34 | loss  4.71 | ppl   111.25\n",
      "| epoch  32 |  3000/ 3481 batches | lr 1.25 | ms/batch  9.36 | loss  4.73 | ppl   112.77\n",
      "| epoch  32 |  3200/ 3481 batches | lr 1.25 | ms/batch  9.38 | loss  4.72 | ppl   112.16\n",
      "| epoch  32 |  3400/ 3481 batches | lr 1.25 | ms/batch  9.38 | loss  4.66 | ppl   105.41\n",
      "val loss 4.706599415893218\n",
      "-----------------------------------------------------------------------------------------\n",
      "| end of epoch  32 | time: 34.39s | valid loss  4.71 | valid ppl   110.68\n",
      "-----------------------------------------------------------------------------------------\n",
      "| epoch  33 |   200/ 3481 batches | lr 1.25 | ms/batch 11.32 | loss  4.77 | ppl   117.80\n",
      "| epoch  33 |   400/ 3481 batches | lr 1.25 | ms/batch  9.39 | loss  4.79 | ppl   120.17\n",
      "| epoch  33 |   600/ 3481 batches | lr 1.25 | ms/batch  9.51 | loss  4.72 | ppl   112.64\n",
      "| epoch  33 |   800/ 3481 batches | lr 1.25 | ms/batch  9.48 | loss  4.64 | ppl   103.26\n",
      "| epoch  33 |  1000/ 3481 batches | lr 1.25 | ms/batch  9.44 | loss  4.75 | ppl   116.03\n",
      "| epoch  33 |  1200/ 3481 batches | lr 1.25 | ms/batch  9.47 | loss  4.70 | ppl   110.50\n",
      "| epoch  33 |  1400/ 3481 batches | lr 1.25 | ms/batch  9.41 | loss  4.74 | ppl   114.72\n",
      "| epoch  33 |  1600/ 3481 batches | lr 1.25 | ms/batch  9.50 | loss  4.77 | ppl   117.95\n",
      "| epoch  33 |  1800/ 3481 batches | lr 1.25 | ms/batch  9.47 | loss  4.78 | ppl   118.98\n",
      "| epoch  33 |  2000/ 3481 batches | lr 1.25 | ms/batch  9.45 | loss  4.76 | ppl   116.62\n",
      "| epoch  33 |  2200/ 3481 batches | lr 1.25 | ms/batch  9.46 | loss  4.78 | ppl   119.32\n",
      "| epoch  33 |  2400/ 3481 batches | lr 1.25 | ms/batch  9.42 | loss  4.76 | ppl   116.23\n",
      "| epoch  33 |  2600/ 3481 batches | lr 1.25 | ms/batch  9.47 | loss  4.64 | ppl   103.13\n",
      "| epoch  33 |  2800/ 3481 batches | lr 1.25 | ms/batch  9.46 | loss  4.71 | ppl   111.20\n",
      "| epoch  33 |  3000/ 3481 batches | lr 1.25 | ms/batch  9.48 | loss  4.73 | ppl   112.77\n",
      "| epoch  33 |  3200/ 3481 batches | lr 1.25 | ms/batch  9.47 | loss  4.72 | ppl   111.93\n",
      "| epoch  33 |  3400/ 3481 batches | lr 1.25 | ms/batch  9.41 | loss  4.66 | ppl   105.67\n",
      "val loss 4.70495966676622\n",
      "-----------------------------------------------------------------------------------------\n",
      "| end of epoch  33 | time: 34.43s | valid loss  4.70 | valid ppl   110.49\n",
      "-----------------------------------------------------------------------------------------\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "| epoch  34 |   200/ 3481 batches | lr 1.25 | ms/batch 11.42 | loss  4.77 | ppl   117.71\n",
      "| epoch  34 |   400/ 3481 batches | lr 1.25 | ms/batch  9.54 | loss  4.79 | ppl   120.21\n",
      "| epoch  34 |   600/ 3481 batches | lr 1.25 | ms/batch  9.40 | loss  4.72 | ppl   112.28\n",
      "| epoch  34 |   800/ 3481 batches | lr 1.25 | ms/batch  9.44 | loss  4.64 | ppl   103.57\n",
      "| epoch  34 |  1000/ 3481 batches | lr 1.25 | ms/batch  9.49 | loss  4.76 | ppl   116.32\n",
      "| epoch  34 |  1200/ 3481 batches | lr 1.25 | ms/batch  9.52 | loss  4.70 | ppl   110.11\n",
      "| epoch  34 |  1400/ 3481 batches | lr 1.25 | ms/batch  9.45 | loss  4.74 | ppl   114.60\n",
      "| epoch  34 |  1600/ 3481 batches | lr 1.25 | ms/batch  9.45 | loss  4.77 | ppl   117.53\n",
      "| epoch  34 |  1800/ 3481 batches | lr 1.25 | ms/batch  9.45 | loss  4.78 | ppl   119.63\n",
      "| epoch  34 |  2000/ 3481 batches | lr 1.25 | ms/batch  9.45 | loss  4.76 | ppl   116.54\n",
      "| epoch  34 |  2200/ 3481 batches | lr 1.25 | ms/batch  9.45 | loss  4.78 | ppl   118.80\n",
      "| epoch  34 |  2400/ 3481 batches | lr 1.25 | ms/batch  9.44 | loss  4.76 | ppl   116.59\n",
      "| epoch  34 |  2600/ 3481 batches | lr 1.25 | ms/batch  9.46 | loss  4.64 | ppl   103.35\n",
      "| epoch  34 |  2800/ 3481 batches | lr 1.25 | ms/batch  9.51 | loss  4.71 | ppl   111.13\n",
      "| epoch  34 |  3000/ 3481 batches | lr 1.25 | ms/batch  9.46 | loss  4.72 | ppl   112.25\n",
      "| epoch  34 |  3200/ 3481 batches | lr 1.25 | ms/batch  9.53 | loss  4.71 | ppl   111.22\n",
      "| epoch  34 |  3400/ 3481 batches | lr 1.25 | ms/batch  9.47 | loss  4.66 | ppl   105.29\n",
      "val loss 4.704262916972983\n",
      "-----------------------------------------------------------------------------------------\n",
      "| end of epoch  34 | time: 34.48s | valid loss  4.70 | valid ppl   110.42\n",
      "-----------------------------------------------------------------------------------------\n",
      "| epoch  35 |   200/ 3481 batches | lr 1.25 | ms/batch 11.43 | loss  4.76 | ppl   116.18\n",
      "| epoch  35 |   400/ 3481 batches | lr 1.25 | ms/batch  9.50 | loss  4.78 | ppl   119.37\n",
      "| epoch  35 |   600/ 3481 batches | lr 1.25 | ms/batch  9.54 | loss  4.72 | ppl   112.01\n",
      "| epoch  35 |   800/ 3481 batches | lr 1.25 | ms/batch  9.44 | loss  4.64 | ppl   103.24\n",
      "| epoch  35 |  1000/ 3481 batches | lr 1.25 | ms/batch  9.44 | loss  4.75 | ppl   115.29\n",
      "| epoch  35 |  1200/ 3481 batches | lr 1.25 | ms/batch  9.48 | loss  4.70 | ppl   110.06\n",
      "| epoch  35 |  1400/ 3481 batches | lr 1.25 | ms/batch  9.46 | loss  4.74 | ppl   114.11\n",
      "| epoch  35 |  1600/ 3481 batches | lr 1.25 | ms/batch  9.43 | loss  4.77 | ppl   117.65\n",
      "| epoch  35 |  1800/ 3481 batches | lr 1.25 | ms/batch  9.48 | loss  4.78 | ppl   118.52\n",
      "| epoch  35 |  2000/ 3481 batches | lr 1.25 | ms/batch  9.46 | loss  4.75 | ppl   115.83\n",
      "| epoch  35 |  2200/ 3481 batches | lr 1.25 | ms/batch  9.44 | loss  4.78 | ppl   118.93\n",
      "| epoch  35 |  2400/ 3481 batches | lr 1.25 | ms/batch  9.42 | loss  4.75 | ppl   115.64\n",
      "| epoch  35 |  2600/ 3481 batches | lr 1.25 | ms/batch  9.46 | loss  4.63 | ppl   102.92\n",
      "| epoch  35 |  2800/ 3481 batches | lr 1.25 | ms/batch  9.48 | loss  4.71 | ppl   111.00\n",
      "| epoch  35 |  3000/ 3481 batches | lr 1.25 | ms/batch  9.52 | loss  4.72 | ppl   112.66\n",
      "| epoch  35 |  3200/ 3481 batches | lr 1.25 | ms/batch  9.41 | loss  4.71 | ppl   110.94\n",
      "| epoch  35 |  3400/ 3481 batches | lr 1.25 | ms/batch  9.48 | loss  4.66 | ppl   105.14\n",
      "val loss 4.703403915295901\n",
      "-----------------------------------------------------------------------------------------\n",
      "| end of epoch  35 | time: 34.48s | valid loss  4.70 | valid ppl   110.32\n",
      "-----------------------------------------------------------------------------------------\n",
      "| epoch  36 |   200/ 3481 batches | lr 1.25 | ms/batch 11.37 | loss  4.76 | ppl   116.25\n",
      "| epoch  36 |   400/ 3481 batches | lr 1.25 | ms/batch  9.38 | loss  4.78 | ppl   119.53\n",
      "| epoch  36 |   600/ 3481 batches | lr 1.25 | ms/batch  9.50 | loss  4.72 | ppl   111.84\n",
      "| epoch  36 |   800/ 3481 batches | lr 1.25 | ms/batch  9.44 | loss  4.63 | ppl   102.59\n",
      "| epoch  36 |  1000/ 3481 batches | lr 1.25 | ms/batch  9.46 | loss  4.75 | ppl   115.27\n",
      "| epoch  36 |  1200/ 3481 batches | lr 1.25 | ms/batch  9.45 | loss  4.70 | ppl   109.95\n",
      "| epoch  36 |  1400/ 3481 batches | lr 1.25 | ms/batch  9.42 | loss  4.73 | ppl   113.82\n",
      "| epoch  36 |  1600/ 3481 batches | lr 1.25 | ms/batch  9.41 | loss  4.76 | ppl   117.17\n",
      "| epoch  36 |  1800/ 3481 batches | lr 1.25 | ms/batch  9.44 | loss  4.77 | ppl   118.31\n",
      "| epoch  36 |  2000/ 3481 batches | lr 1.25 | ms/batch  9.47 | loss  4.76 | ppl   116.32\n",
      "| epoch  36 |  2200/ 3481 batches | lr 1.25 | ms/batch  9.34 | loss  4.77 | ppl   118.19\n",
      "| epoch  36 |  2400/ 3481 batches | lr 1.25 | ms/batch  9.44 | loss  4.75 | ppl   115.49\n",
      "| epoch  36 |  2600/ 3481 batches | lr 1.25 | ms/batch  9.43 | loss  4.63 | ppl   102.56\n",
      "| epoch  36 |  2800/ 3481 batches | lr 1.25 | ms/batch  9.44 | loss  4.70 | ppl   110.31\n",
      "| epoch  36 |  3000/ 3481 batches | lr 1.25 | ms/batch  9.44 | loss  4.72 | ppl   111.86\n",
      "| epoch  36 |  3200/ 3481 batches | lr 1.25 | ms/batch  9.47 | loss  4.71 | ppl   111.16\n",
      "| epoch  36 |  3400/ 3481 batches | lr 1.25 | ms/batch  9.45 | loss  4.66 | ppl   105.20\n",
      "val loss 4.701550582383753\n",
      "-----------------------------------------------------------------------------------------\n",
      "| end of epoch  36 | time: 34.38s | valid loss  4.70 | valid ppl   110.12\n",
      "-----------------------------------------------------------------------------------------\n",
      "| epoch  37 |   200/ 3481 batches | lr 1.25 | ms/batch 11.53 | loss  4.76 | ppl   116.34\n",
      "| epoch  37 |   400/ 3481 batches | lr 1.25 | ms/batch  9.41 | loss  4.77 | ppl   118.10\n",
      "| epoch  37 |   600/ 3481 batches | lr 1.25 | ms/batch  9.55 | loss  4.71 | ppl   111.31\n",
      "| epoch  37 |   800/ 3481 batches | lr 1.25 | ms/batch  9.55 | loss  4.63 | ppl   102.10\n",
      "| epoch  37 |  1000/ 3481 batches | lr 1.25 | ms/batch  9.41 | loss  4.74 | ppl   114.93\n",
      "| epoch  37 |  1200/ 3481 batches | lr 1.25 | ms/batch  9.46 | loss  4.70 | ppl   109.52\n",
      "| epoch  37 |  1400/ 3481 batches | lr 1.25 | ms/batch  9.45 | loss  4.73 | ppl   113.46\n",
      "| epoch  37 |  1600/ 3481 batches | lr 1.25 | ms/batch  9.45 | loss  4.76 | ppl   117.03\n",
      "| epoch  37 |  1800/ 3481 batches | lr 1.25 | ms/batch  9.43 | loss  4.77 | ppl   118.37\n",
      "| epoch  37 |  2000/ 3481 batches | lr 1.25 | ms/batch  9.44 | loss  4.75 | ppl   115.69\n",
      "| epoch  37 |  2200/ 3481 batches | lr 1.25 | ms/batch  9.44 | loss  4.77 | ppl   117.48\n",
      "| epoch  37 |  2400/ 3481 batches | lr 1.25 | ms/batch  9.42 | loss  4.75 | ppl   115.43\n",
      "| epoch  37 |  2600/ 3481 batches | lr 1.25 | ms/batch  9.47 | loss  4.63 | ppl   102.46\n",
      "| epoch  37 |  2800/ 3481 batches | lr 1.25 | ms/batch  9.45 | loss  4.70 | ppl   110.36\n",
      "| epoch  37 |  3000/ 3481 batches | lr 1.25 | ms/batch  9.45 | loss  4.72 | ppl   112.00\n",
      "| epoch  37 |  3200/ 3481 batches | lr 1.25 | ms/batch  9.45 | loss  4.70 | ppl   110.25\n",
      "| epoch  37 |  3400/ 3481 batches | lr 1.25 | ms/batch  9.47 | loss  4.65 | ppl   104.12\n",
      "val loss 4.701002084864914\n",
      "-----------------------------------------------------------------------------------------\n",
      "| end of epoch  37 | time: 34.46s | valid loss  4.70 | valid ppl   110.06\n",
      "-----------------------------------------------------------------------------------------\n",
      "| epoch  38 |   200/ 3481 batches | lr 1.25 | ms/batch 11.36 | loss  4.75 | ppl   115.69\n",
      "| epoch  38 |   400/ 3481 batches | lr 1.25 | ms/batch  9.42 | loss  4.77 | ppl   118.51\n",
      "| epoch  38 |   600/ 3481 batches | lr 1.25 | ms/batch  9.41 | loss  4.71 | ppl   110.68\n",
      "| epoch  38 |   800/ 3481 batches | lr 1.25 | ms/batch  9.43 | loss  4.62 | ppl   101.78\n",
      "| epoch  38 |  1000/ 3481 batches | lr 1.25 | ms/batch  9.52 | loss  4.74 | ppl   114.54\n",
      "| epoch  38 |  1200/ 3481 batches | lr 1.25 | ms/batch  9.50 | loss  4.69 | ppl   108.90\n",
      "| epoch  38 |  1400/ 3481 batches | lr 1.25 | ms/batch  9.54 | loss  4.73 | ppl   113.52\n",
      "| epoch  38 |  1600/ 3481 batches | lr 1.25 | ms/batch  9.48 | loss  4.75 | ppl   115.96\n",
      "| epoch  38 |  1800/ 3481 batches | lr 1.25 | ms/batch  9.48 | loss  4.77 | ppl   118.07\n",
      "| epoch  38 |  2000/ 3481 batches | lr 1.25 | ms/batch  9.44 | loss  4.75 | ppl   115.87\n",
      "| epoch  38 |  2200/ 3481 batches | lr 1.25 | ms/batch  9.49 | loss  4.77 | ppl   117.60\n",
      "| epoch  38 |  2400/ 3481 batches | lr 1.25 | ms/batch  9.45 | loss  4.75 | ppl   115.15\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "| epoch  38 |  2600/ 3481 batches | lr 1.25 | ms/batch  9.44 | loss  4.63 | ppl   102.01\n",
      "| epoch  38 |  2800/ 3481 batches | lr 1.25 | ms/batch  9.45 | loss  4.70 | ppl   109.80\n",
      "| epoch  38 |  3000/ 3481 batches | lr 1.25 | ms/batch  9.47 | loss  4.71 | ppl   111.37\n",
      "| epoch  38 |  3200/ 3481 batches | lr 1.25 | ms/batch  9.52 | loss  4.70 | ppl   110.49\n",
      "| epoch  38 |  3400/ 3481 batches | lr 1.25 | ms/batch  9.41 | loss  4.64 | ppl   103.93\n",
      "val loss 4.699144444380629\n",
      "-----------------------------------------------------------------------------------------\n",
      "| end of epoch  38 | time: 34.47s | valid loss  4.70 | valid ppl   109.85\n",
      "-----------------------------------------------------------------------------------------\n",
      "| epoch  39 |   200/ 3481 batches | lr 1.25 | ms/batch 11.26 | loss  4.75 | ppl   115.51\n",
      "| epoch  39 |   400/ 3481 batches | lr 1.25 | ms/batch  9.37 | loss  4.77 | ppl   117.94\n",
      "| epoch  39 |   600/ 3481 batches | lr 1.25 | ms/batch  9.37 | loss  4.71 | ppl   111.25\n",
      "| epoch  39 |   800/ 3481 batches | lr 1.25 | ms/batch  9.35 | loss  4.63 | ppl   102.06\n",
      "| epoch  39 |  1000/ 3481 batches | lr 1.25 | ms/batch  9.49 | loss  4.74 | ppl   114.03\n",
      "| epoch  39 |  1200/ 3481 batches | lr 1.25 | ms/batch  9.33 | loss  4.69 | ppl   108.78\n",
      "| epoch  39 |  1400/ 3481 batches | lr 1.25 | ms/batch  9.41 | loss  4.73 | ppl   113.04\n",
      "| epoch  39 |  1600/ 3481 batches | lr 1.25 | ms/batch  9.36 | loss  4.75 | ppl   116.11\n",
      "| epoch  39 |  1800/ 3481 batches | lr 1.25 | ms/batch  9.36 | loss  4.77 | ppl   118.10\n",
      "| epoch  39 |  2000/ 3481 batches | lr 1.25 | ms/batch  9.38 | loss  4.75 | ppl   115.24\n",
      "| epoch  39 |  2200/ 3481 batches | lr 1.25 | ms/batch  9.35 | loss  4.77 | ppl   117.64\n",
      "| epoch  39 |  2400/ 3481 batches | lr 1.25 | ms/batch  9.36 | loss  4.74 | ppl   114.32\n",
      "| epoch  39 |  2600/ 3481 batches | lr 1.25 | ms/batch  9.38 | loss  4.63 | ppl   102.02\n",
      "| epoch  39 |  2800/ 3481 batches | lr 1.25 | ms/batch  9.36 | loss  4.70 | ppl   109.74\n",
      "| epoch  39 |  3000/ 3481 batches | lr 1.25 | ms/batch  9.36 | loss  4.72 | ppl   111.67\n",
      "| epoch  39 |  3200/ 3481 batches | lr 1.25 | ms/batch  9.33 | loss  4.70 | ppl   110.02\n",
      "| epoch  39 |  3400/ 3481 batches | lr 1.25 | ms/batch  9.38 | loss  4.65 | ppl   104.63\n",
      "val loss 4.700592147583165\n",
      "-----------------------------------------------------------------------------------------\n",
      "| end of epoch  39 | time: 34.16s | valid loss  4.70 | valid ppl   110.01\n",
      "-----------------------------------------------------------------------------------------\n",
      "| epoch  40 |   200/ 3481 batches | lr 0.31 | ms/batch 11.47 | loss  4.77 | ppl   117.40\n",
      "| epoch  40 |   400/ 3481 batches | lr 0.31 | ms/batch  9.56 | loss  4.81 | ppl   122.19\n",
      "| epoch  40 |   600/ 3481 batches | lr 0.31 | ms/batch  9.43 | loss  4.73 | ppl   113.08\n",
      "| epoch  40 |   800/ 3481 batches | lr 0.31 | ms/batch  9.48 | loss  4.65 | ppl   104.77\n",
      "| epoch  40 |  1000/ 3481 batches | lr 0.31 | ms/batch  9.42 | loss  4.76 | ppl   116.42\n",
      "| epoch  40 |  1200/ 3481 batches | lr 0.31 | ms/batch  9.55 | loss  4.70 | ppl   109.77\n",
      "| epoch  40 |  1400/ 3481 batches | lr 0.31 | ms/batch  9.41 | loss  4.74 | ppl   114.61\n",
      "| epoch  40 |  1600/ 3481 batches | lr 0.31 | ms/batch  9.47 | loss  4.77 | ppl   117.65\n",
      "| epoch  40 |  1800/ 3481 batches | lr 0.31 | ms/batch  9.46 | loss  4.77 | ppl   118.42\n",
      "| epoch  40 |  2000/ 3481 batches | lr 0.31 | ms/batch  9.44 | loss  4.76 | ppl   116.31\n",
      "| epoch  40 |  2200/ 3481 batches | lr 0.31 | ms/batch  9.46 | loss  4.77 | ppl   117.52\n",
      "| epoch  40 |  2400/ 3481 batches | lr 0.31 | ms/batch  9.43 | loss  4.74 | ppl   114.06\n",
      "| epoch  40 |  2600/ 3481 batches | lr 0.31 | ms/batch  9.44 | loss  4.62 | ppl   101.72\n",
      "| epoch  40 |  2800/ 3481 batches | lr 0.31 | ms/batch  9.44 | loss  4.69 | ppl   109.30\n",
      "| epoch  40 |  3000/ 3481 batches | lr 0.31 | ms/batch  9.47 | loss  4.71 | ppl   111.51\n",
      "| epoch  40 |  3200/ 3481 batches | lr 0.31 | ms/batch  9.43 | loss  4.70 | ppl   109.65\n",
      "| epoch  40 |  3400/ 3481 batches | lr 0.31 | ms/batch  9.51 | loss  4.63 | ppl   102.43\n",
      "val loss 4.686332647950745\n",
      "-----------------------------------------------------------------------------------------\n",
      "| end of epoch  40 | time: 34.50s | valid loss  4.69 | valid ppl   108.45\n",
      "-----------------------------------------------------------------------------------------\n"
     ]
    }
   ],
   "source": [
    "# Loop over epochs.\n",
    "best_val_loss = None\n",
    "epochs = 40\n",
    "\n",
    "for epoch in range(1, epochs+1):\n",
    "    epoch_start_time = time.time()\n",
    "    trainf()\n",
    "    val_loss = evaluate(valid_iter)\n",
    "    print('-' * 89)\n",
    "    print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '\n",
    "        'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),\n",
    "                                   val_loss, math.exp(val_loss)))\n",
    "    print('-' * 89)\n",
    "    if not best_val_loss or val_loss < best_val_loss:\n",
    "        best_val_loss = val_loss\n",
    "    else:\n",
    "        # Anneal the learning rate if no improvement has been seen in the validation dataset.\n",
    "        lr /= 4.0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
